diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index da121f06f..b0b17b1a5 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -247,7 +247,7 @@ check_disk(const char *path, blkid_cache cache, int force, * not easily decode the MBR return a failure and prompt to the * user to use force option since we cannot check the partitions. */ - if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) { + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) { check_error(errno); return -1; } @@ -306,7 +306,7 @@ check_disk(const char *path, blkid_cache cache, int force, efi_free(vtoc); (void) close(fd); - return (err); + return (err); } static int @@ -353,7 +353,7 @@ is_whole_disk(const char *path) struct dk_gpt *label; int fd; - if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) return (B_FALSE); if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { (void) close(fd); @@ -390,6 +390,58 @@ is_shorthand_path(const char *arg, char *path, return (error); } +/* + * Determine if the given path is a hot spare within the given configuration. + * If no configuration is given we rely solely on the label. + */ +static boolean_t +is_spare(nvlist_t *config, const char *path) +{ + int fd; + pool_state_t state; + char *name = NULL; + nvlist_t *label; + uint64_t guid, spareguid; + nvlist_t *nvroot; + nvlist_t **spares; + uint_t i, nspares; + boolean_t inuse; + + if ((fd = open(path, O_RDONLY)) < 0) + return (B_FALSE); + + if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || + !inuse || + state != POOL_STATE_SPARE || + zpool_read_label(fd, &label) != 0) { + free(name); + (void) close(fd); + return (B_FALSE); + } + free(name); + (void) close(fd); + + if (config == NULL) + return (B_TRUE); + + verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); + nvlist_free(label); + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + verify(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &spareguid) == 0); + if (spareguid == guid) + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Create a leaf vdev. Determine if this is a file or a device. If it's a * device, fill in the device id to make a complete nvlist. Valid forms for a @@ -914,11 +966,13 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) { nvlist_t **child; uint_t c, children; - char *type, *path, *diskname; + char *type, *path; char devpath[MAXPATHLEN]; char udevpath[MAXPATHLEN]; uint64_t wholedisk; struct stat64 statbuf; + int is_exclusive = 0; + int fd; int ret; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); @@ -941,8 +995,8 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) &wholedisk)); if (!wholedisk) { - ret = zero_label(path); - return (ret); + (void) zero_label(path); + return (0); } if (realpath(path, devpath) == NULL) { @@ -964,26 +1018,44 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) strncpy(udevpath, path, MAXPATHLEN); (void) zfs_append_partition(udevpath, MAXPATHLEN); - if ((strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) && - (lstat64(udevpath, &statbuf) == 0) && - S_ISLNK(statbuf.st_mode)) - (void) unlink(udevpath); - - diskname = strrchr(devpath, '/'); - assert(diskname != NULL); - diskname++; - if (zpool_label_disk(g_zfs, zhp, diskname) == -1) - return (-1); + fd = open(devpath, O_RDWR|O_EXCL); + if (fd == -1) { + if (errno == EBUSY) + is_exclusive = 1; + } else { + (void) close(fd); + } /* - * Now we've labeled the disk and the partitions have been - * created. We still need to wait for udev to create the - * symlinks to those partitions. + * If the partition exists, contains a valid spare label, + * and is opened exclusively there is no need to partition + * it. Hot spares have already been partitioned and are + * held open exclusively by the kernel as a safety measure. + * + * If the provided path is for a /dev/disk/ device its + * symbolic link will be removed, partition table created, + * and then block until udev creates the new link. */ - if ((ret = zpool_label_disk_wait(udevpath, 1000)) != 0) { - (void) fprintf(stderr, - gettext( "cannot resolve path '%s'\n"), udevpath); - return (-1); + if (!is_exclusive || !is_spare(NULL, udevpath)) { + ret = strncmp(udevpath,UDISK_ROOT,strlen(UDISK_ROOT)); + if (ret == 0) { + ret = lstat64(udevpath, &statbuf); + if (ret == 0 && S_ISLNK(statbuf.st_mode)) + (void) unlink(udevpath); + } + + if (zpool_label_disk(g_zfs, zhp, + strrchr(devpath, '/') + 1) == -1) + return (-1); + + ret = zpool_label_disk_wait(udevpath, 1000); + if (ret) { + (void) fprintf(stderr, gettext("cannot " + "resolve path '%s': %d\n"), udevpath, ret); + return (-1); + } + + (void) zero_label(udevpath); } /* @@ -994,9 +1066,6 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) */ verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0); - /* Just in case this partition already existed. */ - (void) zero_label(udevpath); - return (0); } @@ -1019,54 +1088,6 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) return (0); } -/* - * Determine if the given path is a hot spare within the given configuration. - */ -static boolean_t -is_spare(nvlist_t *config, const char *path) -{ - int fd; - pool_state_t state; - char *name = NULL; - nvlist_t *label; - uint64_t guid, spareguid; - nvlist_t *nvroot; - nvlist_t **spares; - uint_t i, nspares; - boolean_t inuse; - - if ((fd = open(path, O_RDONLY|O_EXCL)) < 0) - return (B_FALSE); - - if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || - !inuse || - state != POOL_STATE_SPARE || - zpool_read_label(fd, &label) != 0) { - free(name); - (void) close(fd); - return (B_FALSE); - } - free(name); - (void) close(fd); - - verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); - nvlist_free(label); - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - verify(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &spareguid) == 0); - if (spareguid == guid) - return (B_TRUE); - } - } - - return (B_FALSE); -} - /* * Go through and find any devices that are in use. We rely on libdiskmgt for * the majority of this task. @@ -1098,11 +1119,12 @@ check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, * regardless of what libblkid or zpool_in_use() says. */ if (replacing) { - if (wholedisk) - (void) snprintf(buf, sizeof (buf), "%ss0", - path); - else - (void) strlcpy(buf, path, sizeof (buf)); + (void) strlcpy(buf, path, sizeof (buf)); + if (wholedisk) { + ret = zfs_append_partition(buf, sizeof (buf)); + if (ret == -1) + return (-1); + } if (is_spare(config, buf)) return (0); diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h index 9d3e6f07c..47f569bbd 100644 --- a/include/linux/blkdev_compat.h +++ b/include/linux/blkdev_compat.h @@ -478,4 +478,13 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) */ #define VDEV_SCHEDULER "noop" +/* + * A common holder for vdev_bdev_open() is used to relax the exclusive open + * semantics slightly. Internal vdev disk callers may pass VDEV_HOLDER to + * allow them to open the device multiple times. Other kernel callers and + * user space processes which don't pass this value will get EBUSY. This is + * currently required for the correct operation of hot spares. + */ +#define VDEV_HOLDER ((void *)0x2f5401de7) + #endif /* _ZFS_BLKDEV_H */ diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index f93b7bcc8..1f6507f2f 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -34,6 +34,7 @@ #include char *zfs_vdev_scheduler = VDEV_SCHEDULER; +static void *zfs_vdev_holder = VDEV_HOLDER; /* * Virtual device vector for disks. @@ -203,7 +204,7 @@ vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd) struct gendisk *disk; int error, partno; - bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), vd); + bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder); if (IS_ERR(bdev)) return bdev; @@ -281,7 +282,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, if (v->vdev_wholedisk && v->vdev_expanding) bdev = vdev_disk_rrpart(v->vdev_path, mode, vd); if (IS_ERR(bdev)) - bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd); + bdev = vdev_bdev_open(v->vdev_path, + vdev_bdev_mode(mode), zfs_vdev_holder); if (IS_ERR(bdev)) { kmem_free(vd, sizeof(vdev_disk_t)); return -PTR_ERR(bdev); @@ -783,7 +785,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) uint64_t s, size; int i; - bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), NULL); + bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), zfs_vdev_holder); if (IS_ERR(bdev)) return -PTR_ERR(bdev);