From d603ed6c278f9c25b17ba8e75e9bce6e5d715ac0 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 26 Aug 2010 11:56:53 -0700 Subject: [PATCH] Add linux user disk support This topic branch contains all the changes needed to integrate the user side zfs tools with Linux style devices. Primarily this includes fixing up the Solaris libefi library to be Linux friendly, and integrating with the libblkid library which is provided by e2fsprogs. Signed-off-by: Brian Behlendorf --- cmd/zdb/zdb.c | 1 - cmd/zfs/zfs_main.c | 37 ++ cmd/zinject/zinject.c | 5 +- cmd/zpool/zpool_main.c | 28 +- cmd/zpool/zpool_vdev.c | 567 ++++++++++++++++++------------- lib/libefi/include/sys/uuid.h | 4 - lib/libefi/rdwr_efi.c | 384 ++++++++++++++++++--- lib/libzfs/include/libzfs.h | 24 +- lib/libzfs/include/libzfs_impl.h | 2 + lib/libzfs/libzfs_changelist.c | 12 + lib/libzfs/libzfs_dataset.c | 327 +++++++++++++++++- lib/libzfs/libzfs_import.c | 370 +++++++------------- lib/libzfs/libzfs_mount.c | 51 +++ lib/libzfs/libzfs_pool.c | 256 ++++++-------- lib/libzfs/libzfs_sendrecv.c | 10 + lib/libzfs/libzfs_util.c | 3 + lib/libzpool/kernel.c | 34 ++ module/zfs/sa.c | 5 +- module/zfs/zfs_sa.c | 3 +- 19 files changed, 1402 insertions(+), 721 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 459445bb7..d0354809d 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -55,7 +55,6 @@ #include #include #undef ZFS_MAXNAMELEN -#undef verify #include #define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? 
\ diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 9cb69c1bf..ebcec2d6e 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -318,6 +318,7 @@ safe_malloc(size_t size) return (data); } +#ifdef HAVE_ZPL static char * safe_strdup(char *str) { @@ -328,6 +329,7 @@ safe_strdup(char *str) return (dupstr); } +#endif /* HAVE_ZPL */ /* * Callback routine that will print out information for each of @@ -495,6 +497,7 @@ parse_depth(char *opt, int *flags) #define PROGRESS_DELAY 2 /* seconds */ +#ifdef HAVE_ZPL static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; static time_t pt_begin; static char *pt_header = NULL; @@ -546,6 +549,8 @@ finish_progress(char *done) free(pt_header); pt_header = NULL; } +#endif /* HAVE_ZPL */ + /* * zfs clone [-p] [-o prop=value] ... * @@ -626,6 +631,7 @@ zfs_do_clone(int argc, char **argv) ret = zfs_clone(zhp, argv[1], props); /* create the mountpoint if necessary */ +#ifdef HAVE_ZPL if (ret == 0) { zfs_handle_t *clone; @@ -637,6 +643,7 @@ zfs_do_clone(int argc, char **argv) zfs_close(clone); } } +#endif /* HAVE_ZPL */ zfs_close(zhp); nvlist_free(props); @@ -824,6 +831,7 @@ zfs_do_create(int argc, char **argv) * verbose error message to let the user know that their filesystem was * in fact created, even if we failed to mount or share it. 
*/ +#ifdef HAVE_ZPL if (canmount == ZFS_CANMOUNT_ON) { if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " @@ -835,6 +843,7 @@ zfs_do_create(int argc, char **argv) ret = 1; } } +#endif /* HAVE_ZPL */ error: if (zhp) @@ -2940,6 +2949,7 @@ zfs_do_release(int argc, char **argv) #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 5 /* seconds */ +#ifdef HAVE_ZPL static int get_one_dataset(zfs_handle_t *zhp, void *data) { @@ -3387,6 +3397,7 @@ share_mount(int op, int argc, char **argv) return (ret); } +#endif /* HAVE_ZPL */ /* * zfs mount -a [nfs] @@ -3397,7 +3408,11 @@ share_mount(int op, int argc, char **argv) static int zfs_do_mount(int argc, char **argv) { +#ifdef HAVE_ZPL return (share_mount(OP_MOUNT, argc, argv)); +#else + return ENOSYS; +#endif /* HAVE_ZPL */ } /* @@ -3409,9 +3424,14 @@ zfs_do_mount(int argc, char **argv) static int zfs_do_share(int argc, char **argv) { +#ifdef HAVE_ZPL return (share_mount(OP_SHARE, argc, argv)); +#else + return ENOSYS; +#endif /* HAVE_ZPL */ } +#ifdef HAVE_ZPL typedef struct unshare_unmount_node { zfs_handle_t *un_zhp; char *un_mountp; @@ -3795,6 +3815,7 @@ unshare_unmount(int op, int argc, char **argv) return (ret); } +#endif /* HAVE_ZPL */ /* * zfs unmount -a @@ -3805,7 +3826,11 @@ unshare_unmount(int op, int argc, char **argv) static int zfs_do_unmount(int argc, char **argv) { +#ifdef HAVE_ZPL return (unshare_unmount(OP_MOUNT, argc, argv)); +#else + return ENOSYS; +#endif /* HAVE_ZPL */ } /* @@ -3817,7 +3842,11 @@ zfs_do_unmount(int argc, char **argv) static int zfs_do_unshare(int argc, char **argv) { +#ifdef HAVE_ZPL return (unshare_unmount(OP_SHARE, argc, argv)); +#else + return ENOSYS; +#endif /* HAVE_ZPL */ } /* ARGSUSED */ @@ -3833,6 +3862,7 @@ zfs_do_python(int argc, char **argv) * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is * 'legacy'. Otherwise, complain that use should be using 'zfs mount'. 
*/ +#ifdef HAVE_ZPL static int manual_mount(int argc, char **argv) { @@ -3963,6 +3993,7 @@ manual_unmount(int argc, char **argv) return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE)); } +#endif /* HAVE_ZPL */ static int find_command_idx(char *command, int *idx) @@ -4061,7 +4092,9 @@ main(int argc, char **argv) { int ret; int i = 0; +#ifdef HAVE_ZPL char *progname; +#endif char *cmdname; (void) setlocale(LC_ALL, ""); @@ -4086,6 +4119,7 @@ main(int argc, char **argv) return (1); } +#ifdef HAVE_ZPL /* * This command also doubles as the /etc/fs mount and unmount program. * Determine if we should take this behavior based on argv[0]. @@ -4096,6 +4130,9 @@ main(int argc, char **argv) } else if (strcmp(progname, "umount") == 0) { ret = manual_unmount(argc, argv); } else { +#else + { +#endif /* HAVE_ZPL */ /* * Make sure the user has specified some command. */ diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index 60c53ceb3..643d73e7f 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -954,17 +954,20 @@ main(int argc, char **argv) if (dataset[0] != '\0' && domount) { if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) return (1); - +#ifdef HAVE_ZPL if (zfs_unmount(zhp, NULL, 0) != 0) return (1); +#endif /* HAVE_ZPL */ } record.zi_error = error; ret = register_handler(pool, flags, &record, quiet); +#ifdef HAVE_ZPL if (dataset[0] != '\0' && domount) ret = (zfs_mount(zhp, NULL, 0) != 0); +#endif /* HAVE_ZPL */ libzfs_fini(g_zfs); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 074f76e81..b1b71acf8 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -716,7 +716,9 @@ zpool_do_create(int argc, char **argv) (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; +#ifdef HAVE_ZPL DIR *dirp; +#endif if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " @@ -741,6 +743,7 @@ zpool_do_create(int argc, 
char **argv) mountpoint); } +#ifdef HAVE_ZPL if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); @@ -763,6 +766,7 @@ zpool_do_create(int argc, char **argv) goto errout; } } +#endif /* HAVE_ZPL */ } if (dryrun) { @@ -793,8 +797,12 @@ zpool_do_create(int argc, char **argv) zfs_prop_to_name( ZFS_PROP_MOUNTPOINT), mountpoint) == 0); +#ifdef HAVE_ZPL if (zfs_mount(pool, NULL, 0) == 0) ret = zfs_shareall(pool); +#else + ret = 0; +#endif /* HAVE_ZPL */ zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { @@ -1571,12 +1579,14 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); +#if HAVE_ZPL if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY) && zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } +#endif /* HAVE_ZPL */ zpool_close(zhp); return (0); @@ -1592,7 +1602,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, * -c Read pool information from a cachefile instead of searching * devices. * - * -d Scan in a specific directory, other than /dev/dsk. More than + * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. 
* * -D Scan for previously destroyed pools or import all or only @@ -1773,12 +1783,6 @@ zpool_do_import(int argc, char **argv) nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) goto error; - if (searchdirs == NULL) { - searchdirs = safe_malloc(sizeof (char *)); - searchdirs[0] = "/dev/dsk"; - nsearch = 1; - } - /* check argument count */ if (do_all) { if (argc != 0) { @@ -1799,7 +1803,9 @@ zpool_do_import(int argc, char **argv) if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); - free(searchdirs); + if (searchdirs != NULL) + free(searchdirs); + nvlist_free(policy); return (1); } @@ -1867,7 +1873,8 @@ zpool_do_import(int argc, char **argv) } if (err == 1) { - free(searchdirs); + if (searchdirs != NULL) + free(searchdirs); nvlist_free(policy); return (1); } @@ -1968,7 +1975,8 @@ error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); - free(searchdirs); + if (searchdirs != NULL) + free(searchdirs); return (err ? 1 : 0); } diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 0c224903d..febdda95f 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -50,7 +50,7 @@ * * 1. Construct the vdev specification. Performs syntax validation and * makes sure each device is valid. - * 2. Check for devices in use. Using libdiskmgt, makes sure that no + * 2. Check for devices in use. Using libblkid to make sure that no * devices are also in use. Some can be overridden using the 'force' * flag, others cannot. * 3. Check for replication errors if the 'force' flag is not specified. 
@@ -60,10 +60,10 @@ */ #include +#include #include #include #include -#include #include #include #include @@ -74,13 +74,15 @@ #include #include #include +#include +#ifdef HAVE_LIBBLKID +#include +#else +#define blkid_cache void * +#endif /* HAVE_LIBBLKID */ #include "zpool_util.h" -#define DISK_ROOT "/dev/dsk" -#define RDISK_ROOT "/dev/rdsk" -#define BACKUP_SLICE "s2" - /* * For any given vdev specification, we can have multiple errors. The * vdev_error() function keeps track of whether we have seen an error yet, and @@ -111,168 +113,6 @@ vdev_error(const char *fmt, ...) va_end(ap); } -static void -libdiskmgt_error(int error) -{ - /* - * ENXIO/ENODEV is a valid error message if the device doesn't live in - * /dev/dsk. Don't bother printing an error message in this case. - */ - if (error == ENXIO || error == ENODEV) - return; - - (void) fprintf(stderr, gettext("warning: device in use checking " - "failed: %s\n"), strerror(error)); -} - -/* - * Validate a device, passing the bulk of the work off to libdiskmgt. - */ -static int -check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) -{ - char *msg; - int error = 0; - dm_who_type_t who; - - if (force) - who = DM_WHO_ZPOOL_FORCE; - else if (isspare) - who = DM_WHO_ZPOOL_SPARE; - else - who = DM_WHO_ZPOOL; - - if (dm_inuse((char *)path, &msg, who, &error) || error) { - if (error != 0) { - libdiskmgt_error(error); - return (0); - } else { - vdev_error("%s", msg); - free(msg); - return (-1); - } - } - - /* - * If we're given a whole disk, ignore overlapping slices since we're - * about to label it anyway. 
- */ - error = 0; - if (!wholedisk && !force && - (dm_isoverlapping((char *)path, &msg, &error) || error)) { - if (error == 0) { - /* dm_isoverlapping returned -1 */ - vdev_error(gettext("%s overlaps with %s\n"), path, msg); - free(msg); - return (-1); - } else if (error != ENODEV) { - /* libdiskmgt's devcache only handles physical drives */ - libdiskmgt_error(error); - return (0); - } - } - - return (0); -} - - -/* - * Validate a whole disk. Iterate over all slices on the disk and make sure - * that none is in use by calling check_slice(). - */ -static int -check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) -{ - dm_descriptor_t *drive, *media, *slice; - int err = 0; - int i; - int ret; - - /* - * Get the drive associated with this disk. This should never fail, - * because we already have an alias handle open for the device. - */ - if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, - &err)) == NULL || *drive == NULL) { - if (err) - libdiskmgt_error(err); - return (0); - } - - if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, - &err)) == NULL) { - dm_free_descriptors(drive); - if (err) - libdiskmgt_error(err); - return (0); - } - - dm_free_descriptors(drive); - - /* - * It is possible that the user has specified a removable media drive, - * and the media is not present. - */ - if (*media == NULL) { - dm_free_descriptors(media); - vdev_error(gettext("'%s' has no media in drive\n"), name); - return (-1); - } - - if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, - &err)) == NULL) { - dm_free_descriptors(media); - if (err) - libdiskmgt_error(err); - return (0); - } - - dm_free_descriptors(media); - - ret = 0; - - /* - * Iterate over all slices and report any errors. We don't care about - * overlapping slices because we are using the whole disk. 
- */ - for (i = 0; slice[i] != NULL; i++) { - char *name = dm_get_name(slice[i], &err); - - if (check_slice(name, force, B_TRUE, isspare) != 0) - ret = -1; - - dm_free_name(name); - } - - dm_free_descriptors(slice); - return (ret); -} - -/* - * Validate a device. - */ -static int -check_device(const char *path, boolean_t force, boolean_t isspare) -{ - dm_descriptor_t desc; - int err; - char *dev; - - /* - * For whole disks, libdiskmgt does not include the leading dev path. - */ - dev = strrchr(path, '/'); - assert(dev != NULL); - dev++; - if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { - err = check_disk(path, desc, force, isspare); - dm_free_descriptor(desc); - return (err); - } - - return (check_slice(path, force, B_FALSE, isspare)); -} - /* * Check that a file is valid. All we can do in this case is check that it's * not in use by another pool, and not in use by swap. @@ -283,19 +123,9 @@ check_file(const char *file, boolean_t force, boolean_t isspare) char *name; int fd; int ret = 0; - int err; pool_state_t state; boolean_t inuse; - if (dm_inuse_swap(file, &err)) { - if (err) - libdiskmgt_error(err); - else - vdev_error(gettext("%s is currently used by swap. 
" - "Please see swap(1M).\n"), file); - return (-1); - } - if ((fd = open(file, O_RDONLY)) < 0) return (0); @@ -348,6 +178,175 @@ check_file(const char *file, boolean_t force, boolean_t isspare) return (ret); } +static void +check_error(int err) +{ + (void) fprintf(stderr, gettext("warning: device in use checking " + "failed: %s\n"), strerror(err)); +} + +static int +check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare) +{ + struct stat64 statbuf; + int err; +#ifdef HAVE_LIBBLKID + char *value; +#endif /* HAVE_LIBBLKID */ + + if (stat64(path, &statbuf) != 0) { + vdev_error(gettext("cannot stat %s: %s\n"), + path, strerror(errno)); + return (-1); + } + +#ifdef HAVE_LIBBLKID + /* No valid type detected device is safe to use */ + value = blkid_get_tag_value(cache, "TYPE", path); + if (value == NULL) + return (0); + + /* + * If libblkid detects a ZFS device, we check the device + * using check_file() to see if it's safe. The one safe + * case is a spare device shared between multiple pools. + */ + if (strcmp(value, "zfs") == 0) { + err = check_file(path, force, isspare); + } else { + if (force) { + err = 0; + } else { + err = -1; + vdev_error(gettext("%s contains a filesystem of " + "type '%s'\n"), path, value); + } + } + + free(value); +#else + err = check_file(path, force, isspare); +#endif /* HAVE_LIBBLKID */ + + return (err); +} + +/* + * Validate a whole disk. Iterate over all slices on the disk and make sure + * that none is in use by calling check_slice(). + */ +static int +check_disk(const char *path, blkid_cache cache, int force, + boolean_t isspare, boolean_t iswholedisk) +{ + struct dk_gpt *vtoc; + char slice_path[MAXPATHLEN]; + int err = 0; + int fd, i; + + /* This is not a wholedisk we only check the given partition */ + if (!iswholedisk) + return check_slice(path, cache, force, isspare); + + /* + * When the device is a whole disk try to read the efi partition + * label. 
If this is successful we safely check all of the + * partitions. However, when it fails it may simply be because + * the disk is partitioned via the MBR. Since we currently can + * not easily decode the MBR return a failure and prompt the + * user to use the force option since we cannot check the partitions. + */ + if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) { + check_error(errno); + return -1; + } + + if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) { + (void) close(fd); + + if (force) { + return 0; + } else { + vdev_error(gettext("%s does not contain an EFI " + "label but it may contain partition\n" + "information in the MBR.\n"), path); + return -1; + } + } + + /* + * The primary efi partition label is damaged however the secondary + * label at the end of the device is intact. Rather than use this + * label we should play it safe and treat this as a non efi device. + */ + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { + efi_free(vtoc); + (void) close(fd); + + if (force) { + /* Partitions will no be created using the backup */ + return 0; + } else { + vdev_error(gettext("%s contains a corrupt primary " + "EFI label.\n"), path); + return -1; + } + } + + for (i = 0; i < vtoc->efi_nparts; i++) { + + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED || + uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid)) + continue; + + if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) + (void) snprintf(slice_path, sizeof (slice_path), + "%s%s%d", path, "-part", i+1); + else + (void) snprintf(slice_path, sizeof (slice_path), + "%s%s%d", path, isdigit(path[strlen(path)-1]) ? 
+ "p" : "", i+1); + + err = check_slice(slice_path, cache, force, isspare); + if (err) + break; + } + + efi_free(vtoc); + (void) close(fd); + + return (err); +} + +static int +check_device(const char *path, boolean_t force, + boolean_t isspare, boolean_t iswholedisk) +{ + static blkid_cache cache = NULL; + +#ifdef HAVE_LIBBLKID + /* + * There is no easy way to add a correct blkid_put_cache() call, + * memory will be reclaimed when the command exits. + */ + if (cache == NULL) { + int err; + + if ((err = blkid_get_cache(&cache, NULL)) != 0) { + check_error(err); + return -1; + } + + if ((err = blkid_probe_all(cache)) != 0) { + blkid_put_cache(cache); + check_error(err); + return -1; + } + } +#endif /* HAVE_LIBBLKID */ + + return check_disk(path, cache, force, isspare, iswholedisk); +} /* * By "whole disk" we mean an entire physical disk (something we can @@ -358,15 +357,12 @@ check_file(const char *file, boolean_t force, boolean_t isspare) * it isn't. */ static boolean_t -is_whole_disk(const char *arg) +is_whole_disk(const char *path) { struct dk_gpt *label; int fd; - char path[MAXPATHLEN]; - (void) snprintf(path, sizeof (path), "%s%s%s", - RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); - if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) + if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) return (B_FALSE); if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { (void) close(fd); @@ -377,14 +373,52 @@ is_whole_disk(const char *arg) return (B_TRUE); } +/* + * This may be a shorthand device path or it could be total gibberish. + * Check to see if it's a known device in /dev/, /dev/disk/by-id, + * /dev/disk/by-label, /dev/disk/by-path, /dev/disk/by-uuid, or + * /dev/disk/zpool/. As part of this check, see if we've been given + * an entire disk (minus the slice number). 
+ */ +static int +is_shorthand_path(const char *arg, char *path, + struct stat64 *statbuf, boolean_t *wholedisk) +{ + char dirs[5][8] = {"by-id", "by-label", "by-path", "by-uuid", "zpool"}; + int i, err; + + /* /dev/ */ + (void) snprintf(path, MAXPATHLEN, "%s/%s", DISK_ROOT, arg); + *wholedisk = is_whole_disk(path); + err = stat64(path, statbuf); + if (*wholedisk || err == 0) + return (0); + + /* /dev/disk// */ + for (i = 0; i < 5; i++) { + (void) snprintf(path, MAXPATHLEN, "%s/%s/%s", + UDISK_ROOT, dirs[i], arg); + *wholedisk = is_whole_disk(path); + err = stat64(path, statbuf); + if (*wholedisk || err == 0) + return (0); + } + + strlcpy(path, arg, sizeof(path)); + memset(statbuf, 0, sizeof(*statbuf)); + *wholedisk = B_FALSE; + + return (ENOENT); +} + /* * Create a leaf vdev. Determine if this is a file or a device. If it's a * device, fill in the device id to make a complete nvlist. Valid forms for a * leaf vdev are: * - * /dev/dsk/xxx Complete disk path + * /dev/xxx Complete disk path * /xxx Full path to file - * xxx Shorthand for /dev/dsk/xxx + * xxx Shorthand for /dev/disk/yyy/xxx */ static nvlist_t * make_leaf_vdev(const char *arg, uint64_t is_log) @@ -394,6 +428,7 @@ make_leaf_vdev(const char *arg, uint64_t is_log) nvlist_t *vdev = NULL; char *type = NULL; boolean_t wholedisk = B_FALSE; + int err; /* * Determine what type of vdev this is, and put the full path into @@ -403,28 +438,31 @@ make_leaf_vdev(const char *arg, uint64_t is_log) if (arg[0] == '/') { /* * Complete device or file path. Exact type is determined by - * examining the file descriptor afterwards. + * examining the file descriptor afterwards. Symbolic links + * are resolved to their real paths for the is_whole_disk() + * and S_ISBLK/S_ISREG type checks. However, we are careful + * to store the given path as ZPOOL_CONFIG_PATH to ensure we + * can leverage udev's persistent device labels. 
*/ - wholedisk = is_whole_disk(arg); - if (!wholedisk && (stat64(arg, &statbuf) != 0)) { + if (realpath(arg, path) == NULL) { (void) fprintf(stderr, - gettext("cannot open '%s': %s\n"), - arg, strerror(errno)); + gettext("cannot resolve path '%s'\n"), arg); return (NULL); } - (void) strlcpy(path, arg, sizeof (path)); - } else { - /* - * This may be a short path for a device, or it could be total - * gibberish. Check to see if it's a known device in - * /dev/dsk/. As part of this check, see if we've been given a - * an entire disk (minus the slice number). - */ - (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, - arg); wholedisk = is_whole_disk(path); if (!wholedisk && (stat64(path, &statbuf) != 0)) { + (void) fprintf(stderr, + gettext("cannot open '%s': %s\n"), + path, strerror(errno)); + return (NULL); + } + + /* After is_whole_disk() check restore original passed path */ + strlcpy(path, arg, MAXPATHLEN); + } else { + err = is_shorthand_path(arg, path, &statbuf, &wholedisk); + if (err != 0) { /* * If we got ENOENT, then the user gave us * gibberish, so try to direct them with a @@ -432,7 +470,7 @@ make_leaf_vdev(const char *arg, uint64_t is_log) * regurgitate strerror() since it's the best we * can do. */ - if (errno == ENOENT) { + if (err == ENOENT) { (void) fprintf(stderr, gettext("cannot open '%s': no such " "device in %s\n"), arg, DISK_ROOT); @@ -475,6 +513,7 @@ make_leaf_vdev(const char *arg, uint64_t is_log) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); +#if defined(__sun__) || defined(__sun) /* * For a whole disk, defer getting its devid until after labeling it. 
*/ @@ -486,7 +525,7 @@ make_leaf_vdev(const char *arg, uint64_t is_log) ddi_devid_t devid; char *minor = NULL, *devid_str = NULL; - if ((fd = open(path, O_RDONLY)) < 0) { + if ((fd = open(path, O_RDONLY|O_EXCL)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': " "%s\n"), path, strerror(errno)); nvlist_free(vdev); @@ -509,6 +548,7 @@ make_leaf_vdev(const char *arg, uint64_t is_log) (void) close(fd); } +#endif return (vdev); } @@ -871,6 +911,39 @@ check_replication(nvlist_t *config, nvlist_t *newroot) return (ret); } +static int +zero_label(char *path) +{ + const int size = 4096; + char buf[size]; + int err, fd; + + if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { + (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), + path, strerror(errno)); + return (-1); + } + + memset(buf, 0, size); + err = write(fd, buf, size); + (void) fdatasync(fd); + (void) close(fd); + + if (err == -1) { + (void) fprintf(stderr, gettext("cannot zero first %d bytes " + "of '%s': %s\n"), size, path, strerror(errno)); + return (-1); + } + + if (err != size) { + (void) fprintf(stderr, gettext("could only zero %d/%d bytes " + "of '%s'\n"), err, size, path); + return (-1); + } + + return 0; +} + /* * Go through and find any whole disks in the vdev specification, labelling them * as appropriate. When constructing the vdev spec, we were unable to open this @@ -889,10 +962,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) char *type, *path, *diskname; char buf[MAXPATHLEN]; uint64_t wholedisk; - int fd; int ret; - ddi_devid_t devid; - char *minor = NULL, *devid_str = NULL; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); @@ -903,55 +973,66 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) return (0); /* - * We have a disk device. Get the path to the device - * and see if it's a whole disk by appending the backup - * slice and stat()ing the device. + * We have a disk device. 
If this is a whole disk write + * out the efi partition table, otherwise write zeros to + * the first 4k of the partition. This is to ensure that + * libblkid will not misidentify the partition due to a + * magic value left by the previous filesystem. */ - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk) != 0 || !wholedisk) - return (0); + verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); + verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk)); - diskname = strrchr(path, '/'); + if (!wholedisk) { + ret = zero_label(path); + return (ret); + } + + if (realpath(path, buf) == NULL) { + ret = errno; + (void) fprintf(stderr, + gettext("cannot resolve path '%s'\n"), path); + return (ret); + } + + diskname = strrchr(buf, '/'); assert(diskname != NULL); diskname++; if (zpool_label_disk(g_zfs, zhp, diskname) == -1) return (-1); /* - * Fill in the devid, now that we've labeled the disk. + * Now that we've labeled the disk and the partitions have + * been created, we still need to wait for udev to create + * the symlinks to those partitions. If we are accessing + * the devices via a udev disk path, /dev/disk, then wait + * for *-part# to be created. Otherwise just use the normal + * syntax for devices in /dev. */ - (void) snprintf(buf, sizeof (buf), "%ss0", path); - if ((fd = open(buf, O_RDONLY)) < 0) { + if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) + (void) snprintf(buf, sizeof (buf), + "%s%s%s", path, "-part", FIRST_SLICE); + else + (void) snprintf(buf, sizeof (buf), + "%s%s%s", path, isdigit(path[strlen(path)-1]) ? 
+ "p" : "", FIRST_SLICE); + + if ((ret = zpool_label_disk_wait(buf, 1000)) != 0) { (void) fprintf(stderr, - gettext("cannot open '%s': %s\n"), - buf, strerror(errno)); + gettext( "cannot resolve path '%s'\n"), buf); return (-1); } - if (devid_get(fd, &devid) == 0) { - if (devid_get_minor_name(fd, &minor) == 0 && - (devid_str = devid_str_encode(devid, minor)) != - NULL) { - verify(nvlist_add_string(nv, - ZPOOL_CONFIG_DEVID, devid_str) == 0); - } - if (devid_str != NULL) - devid_str_free(devid_str); - if (minor != NULL) - devid_str_free(minor); - devid_free(devid); - } - /* - * Update the path to refer to the 's0' slice. The presence of + * Update the path to refer to FIRST_SLICE. The presence of * the 'whole_disk' field indicates to the CLI that we should * chop off the slice number when displaying the device in * future output. */ verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); - (void) close(fd); + /* Just in case this partition already existed. */ + (void) zero_label(buf); return (0); } @@ -991,7 +1072,7 @@ is_spare(nvlist_t *config, const char *path) uint_t i, nspares; boolean_t inuse; - if ((fd = open(path, O_RDONLY)) < 0) + if ((fd = open(path, O_RDONLY|O_EXCL)) < 0) return (B_FALSE); if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || @@ -1034,25 +1115,27 @@ check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, nvlist_t **child; uint_t c, children; char *type, *path; - int ret; + int ret = 0; char buf[MAXPATHLEN]; - uint64_t wholedisk; + uint64_t wholedisk = B_FALSE; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); + verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); + if (strcmp(type, VDEV_TYPE_DISK) == 0) + verify(!nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); /* * As a generic check, we look to see if this is a replace of a * hot spare 
within the same pool. If so, we allow it - * regardless of what libdiskmgt or zpool_in_use() says. + * regardless of what libblkid or zpool_in_use() says. */ if (replacing) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk) == 0 && wholedisk) + if (wholedisk) (void) snprintf(buf, sizeof (buf), "%ss0", path); else @@ -1063,7 +1146,7 @@ check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, } if (strcmp(type, VDEV_TYPE_DISK) == 0) - ret = check_device(path, force, isspare); + ret = check_device(path, force, isspare, wholedisk); if (strcmp(type, VDEV_TYPE_FILE) == 0) ret = check_file(path, force, isspare); diff --git a/lib/libefi/include/sys/uuid.h b/lib/libefi/include/sys/uuid.h index 9ce872e34..eab4622a6 100644 --- a/lib/libefi/include/sys/uuid.h +++ b/lib/libefi/include/sys/uuid.h @@ -74,12 +74,8 @@ struct uuid { uint8_t node_addr[6]; }; -#define UUID_LEN 16 - #define UUID_PRINTABLE_STRING_LENGTH 37 -typedef uchar_t uuid_t[UUID_LEN]; - /* * Convert a uuid to/from little-endian format */ diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c index e682b840a..da71e3486 100644 --- a/lib/libefi/rdwr_efi.c +++ b/lib/libefi/rdwr_efi.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,9 @@ #include #include #include -#include +#if defined(__linux__) +#include +#endif static struct uuid_to_ptag { struct uuid uuid; @@ -49,11 +52,11 @@ static struct uuid_to_ptag { { EFI_SWAP }, { EFI_USR }, { EFI_BACKUP }, - { 0 }, /* STAND is never used */ + { EFI_UNUSED }, /* STAND is never used */ { EFI_VAR }, { EFI_HOME }, { EFI_ALTSCTR }, - { 0 }, /* CACHE (cachefs) is never used */ + { EFI_UNUSED }, /* CACHE (cachefs) is never used */ { EFI_RESERVED }, { EFI_SYSTEM }, { EFI_LEGACY_MBR }, @@ -107,21 +110,144 @@ int efi_debug = 1; int efi_debug = 0; #endif -extern unsigned int efi_crc32(const unsigned char *, unsigned int); -static int efi_read(int, struct dk_gpt *); +static int efi_read(int, struct dk_gpt *); + 
+/* + * Return a 32-bit CRC of the contents of the buffer. Pre-and-post + * one's conditioning will be handled by crc32() internally. + */ +static uint32_t +efi_crc32(const unsigned char *buf, unsigned int size) +{ + uint32_t crc = crc32(0, Z_NULL, 0); + + crc = crc32(crc, buf, size); + + return (crc); +} static int read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize) { - struct dk_minfo disk_info; + int sector_size; + unsigned long long capacity_size; + + if (ioctl(fd, BLKSSZGET, &sector_size) < 0) + return (-1); + + if (ioctl(fd, BLKGETSIZE64, &capacity_size) < 0) + return (-1); + + *lbsize = (uint_t)sector_size; + *capacity = (diskaddr_t)(capacity_size / sector_size); - if ((ioctl(fd, DKIOCGMEDIAINFO, (caddr_t)&disk_info)) == -1) - return (errno); - *capacity = disk_info.dki_capacity; - *lbsize = disk_info.dki_lbsize; return (0); } +static int +efi_get_info(int fd, struct dk_cinfo *dki_info) +{ +#if defined(__linux__) + char *path; + char *dev_path; + int rval = 0; + + memset(dki_info, 0, sizeof(*dki_info)); + + path = calloc(PATH_MAX, 1); + if (path == NULL) + goto error; + + /* + * The simplest way to get the partition number under linux is + * to parse it out of the /dev/ block device name. + * The kernel creates this using the partition number when it + * populates /dev/ so it may be trusted. The tricky bit here is + * that the naming convention is based on the block device type. + * So we need to take this into account when parsing out the + * partition information. Another issue is that the libefi + * API only provides the open fd and not the file path. To handle + * this realpath(3) is used to resolve the block device name from + * /proc/self/fd/. Aside from the partition number we collect + * some additional device info. 
+ */ + (void) sprintf(path, "/proc/self/fd/%d", fd); + dev_path = realpath(path, NULL); + free(path); + + if (dev_path == NULL) + goto error; + + if ((strncmp(dev_path, "/dev/sd", 7) == 0)) { + strcpy(dki_info->dki_cname, "sd"); + dki_info->dki_ctype = DKC_SCSI_CCS; + rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", + dki_info->dki_dname, + &dki_info->dki_partition); + } else if ((strncmp(dev_path, "/dev/hd", 7) == 0)) { + strcpy(dki_info->dki_cname, "hd"); + dki_info->dki_ctype = DKC_DIRECT; + rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", + dki_info->dki_dname, + &dki_info->dki_partition); + } else if ((strncmp(dev_path, "/dev/md", 7) == 0)) { + strcpy(dki_info->dki_cname, "pseudo"); + dki_info->dki_ctype = DKC_MD; + rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu", + dki_info->dki_dname, + &dki_info->dki_partition); + } else if ((strncmp(dev_path, "/dev/dm-", 8) == 0)) { + strcpy(dki_info->dki_cname, "pseudo"); + dki_info->dki_ctype = DKC_VBD; + rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9-]p%hu", + dki_info->dki_dname, + &dki_info->dki_partition); + } else if ((strncmp(dev_path, "/dev/ram", 8) == 0)) { + strcpy(dki_info->dki_cname, "pseudo"); + dki_info->dki_ctype = DKC_PCMCIA_MEM; + rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu", + dki_info->dki_dname, + &dki_info->dki_partition); + } else if ((strncmp(dev_path, "/dev/loop", 9) == 0)) { + strcpy(dki_info->dki_cname, "pseudo"); + dki_info->dki_ctype = DKC_VBD; + rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu", + dki_info->dki_dname, + &dki_info->dki_partition); + } else { + strcpy(dki_info->dki_dname, "unknown"); + strcpy(dki_info->dki_cname, "unknown"); + dki_info->dki_ctype = DKC_UNKNOWN; + } + + switch (rval) { + case 0: + errno = EINVAL; + goto error; + case 1: + dki_info->dki_partition = 0; + } + + free(dev_path); +#else + if (ioctl(fd, DKIOCINFO, (caddr_t)dki_info) == -1) + goto error; +#endif + return (0); +error: + if (efi_debug) + (void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno); + + switch (errno) { + 
case EIO: + return (VT_EIO); + case EINVAL: + return (VT_EINVAL); + default: + return (VT_ERROR); + } +} + /* * the number of blocks the EFI label takes up (round up to nearest * block) @@ -135,12 +261,13 @@ read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize) int efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc) { - diskaddr_t capacity; - uint_t lbsize; + diskaddr_t capacity = 0; + uint_t lbsize = 0; uint_t nblocks; size_t length; struct dk_gpt *vptr; struct uuid uuid; + struct dk_cinfo dki_info; if (read_disk_info(fd, &capacity, &lbsize) != 0) { if (efi_debug) @@ -148,6 +275,22 @@ efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc) "couldn't read disk information\n"); return (-1); } +#if defined(__linux__) + if (efi_get_info(fd, &dki_info) != 0) { + if (efi_debug) + (void) fprintf(stderr, + "couldn't read disk information\n"); + return (-1); + } + + if (dki_info.dki_partition != 0) + return (-1); + + if ((dki_info.dki_ctype == DKC_PCMCIA_MEM) || + (dki_info.dki_ctype == DKC_VBD) || + (dki_info.dki_ctype == DKC_UNKNOWN)) + return (-1); +#endif nblocks = NBLOCKS(nparts, lbsize); if ((nblocks * lbsize) < EFI_MIN_ARRAY_SIZE + lbsize) { @@ -243,14 +386,138 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) { void *data = dk_ioc->dki_data; int error; +#if defined(__linux__) + diskaddr_t capacity; + uint_t lbsize; + /* + * When the IO is not being performed in kernel as an ioctl we need + * to know the sector size so we can seek to the proper byte offset. 
+ */ + if (read_disk_info(fd, &capacity, &lbsize) == -1) { + if (efi_debug) + fprintf(stderr,"unable to read disk info: %d",errno); + + errno = EIO; + return -1; + } + + switch (cmd) { + case DKIOCGETEFI: + if (lbsize == 0) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI assuming " + "LBA %d bytes\n", DEV_BSIZE); + + lbsize = DEV_BSIZE; + } + + error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET); + if (error == -1) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI lseek " + "error: %d\n", errno); + return error; + } + + error = read(fd, data, dk_ioc->dki_length); + if (error == -1) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI read " + "error: %d\n", errno); + return error; + } + + if (error != dk_ioc->dki_length) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI short " + "read of %d bytes\n", error); + errno = EIO; + return -1; + } + error = 0; + break; + + case DKIOCSETEFI: + if (lbsize == 0) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI unknown " + "LBA size\n"); + errno = EIO; + return -1; + } + + error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET); + if (error == -1) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI lseek " + "error: %d\n", errno); + return error; + } + + error = write(fd, data, dk_ioc->dki_length); + if (error == -1) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI write " + "error: %d\n", errno); + return error; + } + + if (error != dk_ioc->dki_length) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI short " + "write of %d bytes\n", error); + errno = EIO; + return -1; + } + + /* Sync the new EFI table to disk */ + error = fsync(fd); + if (error == -1) + return error; + + /* Ensure any local disk cache is also flushed */ + if (ioctl(fd, BLKFLSBUF, 0) == -1) + return error; + + error = 0; + break; + + default: + if (efi_debug) + (void) fprintf(stderr, "unsupported ioctl()\n"); + + errno = EIO; + return -1; + } +#else dk_ioc->dki_data_64 = (uint64_t)(uintptr_t)data; error 
= ioctl(fd, cmd, (void *)dk_ioc); dk_ioc->dki_data = data; - +#endif return (error); } +#if defined(__linux__) +static int +efi_rescan(int fd) +{ + int retry = 5; + int error; + + /* Notify the kernel a devices partition table has been updated */ + while ((error = ioctl(fd, BLKRRPART)) != 0) { + if (--retry == 0) { + (void) fprintf(stderr, "the kernel failed to rescan " + "the partition table: %d\n", errno); + return (-1); + } + } + + return (0); +} +#endif + static int check_label(int fd, dk_efi_t *dk_ioc) { @@ -305,6 +572,8 @@ efi_read(int fd, struct dk_gpt *vtoc) int rval = 0; int md_flag = 0; int vdc_flag = 0; + diskaddr_t capacity = 0; + uint_t lbsize = 0; struct dk_minfo disk_info; dk_efi_t dk_ioc; efi_gpt_t *efi; @@ -316,19 +585,9 @@ efi_read(int fd, struct dk_gpt *vtoc) /* * get the partition number for this file descriptor. */ - if (ioctl(fd, DKIOCINFO, (caddr_t)&dki_info) == -1) { - if (efi_debug) { - (void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno); - } - switch (errno) { - case EIO: - return (VT_EIO); - case EINVAL: - return (VT_EINVAL); - default: - return (VT_ERROR); - } - } + if ((rval = efi_get_info(fd, &dki_info)) != 0) + return rval; + if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && (strncmp(dki_info.dki_dname, "md", 3) == 0)) { md_flag++; @@ -342,14 +601,18 @@ efi_read(int fd, struct dk_gpt *vtoc) } /* get the LBA size */ - if (ioctl(fd, DKIOCGMEDIAINFO, (caddr_t)&disk_info) == -1) { + if (read_disk_info(fd, &capacity, &lbsize) == -1) { if (efi_debug) { (void) fprintf(stderr, - "assuming LBA 512 bytes %d\n", - errno); + "unable to read disk info: %d", + errno); } - disk_info.dki_lbsize = DEV_BSIZE; + return (VT_EINVAL); } + + disk_info.dki_lbsize = lbsize; + disk_info.dki_capacity = capacity; + if (disk_info.dki_lbsize == 0) { if (efi_debug) { (void) fprintf(stderr, @@ -374,9 +637,11 @@ efi_read(int fd, struct dk_gpt *vtoc) } } - if ((dk_ioc.dki_data = calloc(label_len, 1)) == NULL) + if (posix_memalign((void **)&dk_ioc.dki_data, + 
disk_info.dki_lbsize, label_len)) return (VT_ERROR); + memset(dk_ioc.dki_data, 0, label_len); dk_ioc.dki_length = disk_info.dki_lbsize; user_length = vtoc->efi_nparts; efi = dk_ioc.dki_data; @@ -572,12 +837,14 @@ write_pmbr(int fd, struct dk_gpt *vtoc) int len; len = (vtoc->efi_lbasize == 0) ? sizeof (mb) : vtoc->efi_lbasize; - buf = calloc(len, 1); + if (posix_memalign((void **)&buf, len, len)) + return (VT_ERROR); /* * Preserve any boot code and disk signature if the first block is * already an MBR. */ + memset(buf, 0, len); dk_ioc.dki_lba = 0; dk_ioc.dki_length = len; /* LINTED -- always longlong aligned */ @@ -663,10 +930,9 @@ check_input(struct dk_gpt *vtoc) if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) && (vtoc->efi_parts[i].p_size != 0)) { if (efi_debug) { - (void) fprintf(stderr, -"partition %d is \"unassigned\" but has a size of %llu", - i, - vtoc->efi_parts[i].p_size); + (void) fprintf(stderr, "partition %d is " + "\"unassigned\" but has a size of %llu", + i, vtoc->efi_parts[i].p_size); } return (VT_EINVAL); } @@ -679,9 +945,9 @@ check_input(struct dk_gpt *vtoc) if (vtoc->efi_parts[i].p_tag == V_RESERVED) { if (resv_part != -1) { if (efi_debug) { - (void) fprintf(stderr, -"found duplicate reserved partition at %d\n", - i); + (void) fprintf(stderr, "found " + "duplicate reserved partition " + "at %d\n", i); } return (VT_EINVAL); } @@ -732,8 +998,8 @@ check_input(struct dk_gpt *vtoc) (istart <= endsect)) { if (efi_debug) { (void) fprintf(stderr, -"Partition %d overlaps partition %d.", - i, j); + "Partition %d overlaps " + "partition %d.", i, j); } return (VT_EINVAL); } @@ -839,22 +1105,13 @@ efi_write(int fd, struct dk_gpt *vtoc) efi_gpe_t *efi_parts; int i, j; struct dk_cinfo dki_info; + int rval; int md_flag = 0; int nblocks; diskaddr_t lba_backup_gpt_hdr; - if (ioctl(fd, DKIOCINFO, (caddr_t)&dki_info) == -1) { - if (efi_debug) - (void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno); - switch (errno) { - case EIO: - return (VT_EIO); - case EINVAL: - 
return (VT_EINVAL); - default: - return (VT_ERROR); - } - } + if ((rval = efi_get_info(fd, &dki_info)) != 0) + return rval; /* check if we are dealing wih a metadevice */ if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && @@ -892,9 +1149,11 @@ efi_write(int fd, struct dk_gpt *vtoc) * for backup GPT header. */ lba_backup_gpt_hdr = vtoc->efi_last_u_lba + 1 + nblocks; - if ((dk_ioc.dki_data = calloc(dk_ioc.dki_length, 1)) == NULL) + if (posix_memalign((void **)&dk_ioc.dki_data, + vtoc->efi_lbasize, dk_ioc.dki_length)) return (VT_ERROR); + memset(dk_ioc.dki_data, 0, dk_ioc.dki_length); efi = dk_ioc.dki_data; /* stuff user's input into EFI struct */ @@ -941,6 +1200,10 @@ efi_write(int fd, struct dk_gpt *vtoc) return (VT_EINVAL); } + /* Zero's should be written for empty partitions */ + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) + continue; + efi_parts[i].efi_gpe_StartingLBA = LE_64(vtoc->efi_parts[i].p_start); efi_parts[i].efi_gpe_EndingLBA = @@ -1032,6 +1295,13 @@ efi_write(int fd, struct dk_gpt *vtoc) /* write the PMBR */ (void) write_pmbr(fd, vtoc); free(dk_ioc.dki_data); + +#if defined(__linux__) + rval = efi_rescan(fd); + if (rval) + return (VT_ERROR); +#endif + return (0); } @@ -1049,6 +1319,7 @@ efi_free(struct dk_gpt *ptr) int efi_type(int fd) { +#if 0 struct vtoc vtoc; struct extvtoc extvtoc; @@ -1062,6 +1333,9 @@ efi_type(int fd) } } return (0); +#else + return (ENOSYS); +#endif } void @@ -1175,7 +1449,7 @@ efi_auto_sense(int fd, struct dk_gpt **vtoc) return (-1); } - for (i = 0; i < min((*vtoc)->efi_nparts, V_NUMPAR); i++) { + for (i = 0; i < MIN((*vtoc)->efi_nparts, V_NUMPAR); i++) { (*vtoc)->efi_parts[i].p_tag = default_vtoc_map[i].p_tag; (*vtoc)->efi_parts[i].p_flag = default_vtoc_map[i].p_flag; (*vtoc)->efi_parts[i].p_start = 0; diff --git a/lib/libzfs/include/libzfs.h b/lib/libzfs/include/libzfs.h index 197e2eefc..dcbd283ac 100644 --- a/lib/libzfs/include/libzfs.h +++ b/lib/libzfs/include/libzfs.h @@ -48,6 +48,26 @@ extern "C" { #define 
ZFS_MAXPROPLEN MAXPATHLEN #define ZPOOL_MAXPROPLEN MAXPATHLEN +/* + * Default device paths + */ + +#if defined(__sun__) || defined(__sun) +#define DISK_ROOT "/dev/dsk" +#define RDISK_ROOT "/dev/rdsk" +#define UDISK_ROOT RDISK_ROOT +#define FIRST_SLICE "s0" +#define BACKUP_SLICE "s2" +#endif + +#ifdef __linux__ +#define DISK_ROOT "/dev" +#define RDISK_ROOT DISK_ROOT +#define UDISK_ROOT "/dev/disk" +#define FIRST_SLICE "1" +#define BACKUP_SLICE "" +#endif + /* * libzfs errors */ @@ -248,6 +268,7 @@ extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); +extern int zpool_label_disk_wait(char *, int); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); /* @@ -661,9 +682,6 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, extern int zpool_read_label(int, nvlist_t **); extern int zpool_clear_label(int); -/* is this zvol valid for use as a dump device? 
*/ -extern int zvol_check_dump_config(char *); - /* * Management interfaces for SMB ACL files */ diff --git a/lib/libzfs/include/libzfs_impl.h b/lib/libzfs/include/libzfs_impl.h index 3d001df07..2389b7823 100644 --- a/lib/libzfs/include/libzfs_impl.h +++ b/lib/libzfs/include/libzfs_impl.h @@ -191,6 +191,8 @@ zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); +int zvol_create_link(libzfs_handle_t *, const char *); +int zvol_remove_link(libzfs_handle_t *, const char *); boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 0bcfc0423..6f067d563 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -93,6 +93,7 @@ struct prop_changelist { int changelist_prefix(prop_changelist_t *clp) { +#ifdef HAVE_ZPL prop_changenode_t *cn; int ret = 0; @@ -141,6 +142,9 @@ changelist_prefix(prop_changelist_t *clp) (void) changelist_postfix(clp); return (ret); +#else + return 0; +#endif /* HAVE_ZPL */ } /* @@ -155,6 +159,7 @@ changelist_prefix(prop_changelist_t *clp) int changelist_postfix(prop_changelist_t *clp) { +#ifdef HAVE_ZPL prop_changenode_t *cn; char shareopts[ZFS_MAXPROPLEN]; int errors = 0; @@ -255,6 +260,9 @@ changelist_postfix(prop_changelist_t *clp) } return (errors ? 
-1 : 0); +#else + return 0; +#endif /* HAVE_ZPL */ } /* @@ -317,6 +325,7 @@ changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) int changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto) { +#ifdef HAVE_ZPL prop_changenode_t *cn; int ret = 0; @@ -331,6 +340,9 @@ changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto) } return (ret); +#else + return 0; +#endif } /* diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index baf289b64..d876e5d1f 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -57,6 +57,7 @@ #include "libzfs_impl.h" #include "zfs_deleg.h" +static int zvol_create_link_common(libzfs_handle_t *, const char *, int); static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); @@ -994,6 +995,7 @@ badlabel: /*FALLTHRU*/ +#ifdef HAVE_ZPL case ZFS_PROP_SHARESMB: case ZFS_PROP_SHARENFS: /* @@ -1104,6 +1106,7 @@ badlabel: } break; +#endif /* HAVE_ZPL */ case ZFS_PROP_UTF8ONLY: chosen_utf = (int)intval; break; @@ -2742,6 +2745,7 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) goto ancestorerr; } +#ifdef HAVE_ZPL if (zfs_mount(h, NULL, 0) != 0) { opname = dgettext(TEXT_DOMAIN, "mount"); goto ancestorerr; @@ -2751,6 +2755,7 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) opname = dgettext(TEXT_DOMAIN, "share"); goto ancestorerr; } +#endif /* HAVE_ZPL */ zfs_close(h); } @@ -2887,6 +2892,18 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, /* create the dataset */ ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc); + if (ret == 0 && type == ZFS_TYPE_VOLUME) { + ret = zvol_create_link(hdl, path); + if (ret) { + (void) zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, + "Volume successfully created, but device links " + "were not created")); + zcmd_free_nvlists(&zc); + return (-1); + } + } + zcmd_free_nvlists(&zc); /* check 
for failure */ @@ -2949,6 +2966,9 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer) (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (ZFS_IS_VOLUME(zhp)) { + if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) + return (-1); + zc.zc_objset_type = DMU_OST_ZVOL; } else { zc.zc_objset_type = DMU_OST_ZFS; @@ -2991,9 +3011,17 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) zfs_close(szhp); } + if (zhp->zfs_type == ZFS_TYPE_VOLUME) { + (void) zvol_remove_link(zhp->zfs_hdl, name); + /* + * NB: this is simply a best-effort. We don't want to + * return an error, because then we wouldn't visit all + * the volumes. + */ + } + dd->closezhp = B_TRUE; - if (!dd->gotone) - rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg); + rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg); if (closezhp) zfs_close(zhp); return (rv); @@ -3128,11 +3156,70 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } + } else if (ZFS_IS_VOLUME(zhp)) { + ret = zvol_create_link(zhp->zfs_hdl, target); } return (ret); } +typedef struct promote_data { + char cb_mountpoint[MAXPATHLEN]; + const char *cb_target; + const char *cb_errbuf; + uint64_t cb_pivot_txg; +} promote_data_t; + +static int +promote_snap_cb(zfs_handle_t *zhp, void *data) +{ + promote_data_t *pd = data; + zfs_handle_t *szhp; + char snapname[MAXPATHLEN]; + int rv = 0; + + /* We don't care about snapshots after the pivot point */ + if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) { + zfs_close(zhp); + return (0); + } + + /* Remove the device link if it's a zvol. 
*/ + if (ZFS_IS_VOLUME(zhp)) + (void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name); + + /* Check for conflicting names */ + (void) strlcpy(snapname, pd->cb_target, sizeof (snapname)); + (void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname)); + szhp = make_dataset_handle(zhp->zfs_hdl, snapname); + if (szhp != NULL) { + zfs_close(szhp); + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "snapshot name '%s' from origin \n" + "conflicts with '%s' from target"), + zhp->zfs_name, snapname); + rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf); + } + zfs_close(zhp); + return (rv); +} + +static int +promote_snap_done_cb(zfs_handle_t *zhp, void *data) +{ + promote_data_t *pd = data; + + /* We don't care about snapshots after the pivot point */ + if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) { + /* Create the device link if it's a zvol. */ + if (ZFS_IS_VOLUME(zhp)) + (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name); + } + + zfs_close(zhp); + return (0); +} + /* * Promotes the given clone fs to be the clone parent. 
*/ @@ -3142,7 +3229,10 @@ zfs_promote(zfs_handle_t *zhp) libzfs_handle_t *hdl = zhp->zfs_hdl; zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; char parent[MAXPATHLEN]; + char *cp; int ret; + zfs_handle_t *pzhp; + promote_data_t pd; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, @@ -3160,7 +3250,29 @@ zfs_promote(zfs_handle_t *zhp) "not a cloned filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } + cp = strchr(parent, '@'); + *cp = '\0'; + /* Walk the snapshots we will be moving */ + pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); + if (pzhp == NULL) + return (-1); + pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG); + zfs_close(pzhp); + pd.cb_target = zhp->zfs_name; + pd.cb_errbuf = errbuf; + pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET); + if (pzhp == NULL) + return (-1); + (void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint, + sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE); + ret = zfs_iter_snapshots(pzhp, promote_snap_cb, &pd); + if (ret != 0) { + zfs_close(pzhp); + return (-1); + } + + /* issue the ioctl */ (void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); @@ -3169,9 +3281,16 @@ zfs_promote(zfs_handle_t *zhp) if (ret != 0) { int save_errno = errno; + (void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd); + zfs_close(pzhp); + switch (save_errno) { case EEXIST: - /* There is a conflicting snapshot name. */ + /* + * There is a conflicting snapshot name. We + * should have caught this above, but they could + * have renamed something in the mean time. 
+ */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "conflicting snapshot '%s' from parent '%s'"), zc.zc_string, parent); @@ -3180,7 +3299,44 @@ zfs_promote(zfs_handle_t *zhp) default: return (zfs_standard_error(hdl, save_errno, errbuf)); } + } else { + (void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd); } + + zfs_close(pzhp); + return (ret); +} + +struct createdata { + const char *cd_snapname; + int cd_ifexists; +}; + +static int +zfs_create_link_cb(zfs_handle_t *zhp, void *arg) +{ + struct createdata *cd = arg; + int ret; + + if (zhp->zfs_type == ZFS_TYPE_VOLUME) { + char name[MAXPATHLEN]; + + (void) strlcpy(name, zhp->zfs_name, sizeof (name)); + (void) strlcat(name, "@", sizeof (name)); + (void) strlcat(name, cd->cd_snapname, sizeof (name)); + (void) zvol_create_link_common(zhp->zfs_hdl, name, + cd->cd_ifexists); + /* + * NB: this is simply a best-effort. We don't want to + * return an error, because then we wouldn't visit all + * the volumes. + */ + } + + ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd); + + zfs_close(zhp); + return (ret); } @@ -3244,11 +3400,31 @@ zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, * if it was recursive, the one that actually failed will be in * zc.zc_name. 
*/ - if (ret != 0) { + if (ret != 0) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value); - (void) zfs_standard_error(hdl, errno, errbuf); + + if (ret == 0 && recursive) { + struct createdata cd; + + cd.cd_snapname = delim + 1; + cd.cd_ifexists = B_FALSE; + (void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd); } + if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) { + ret = zvol_create_link(zhp->zfs_hdl, path); + if (ret != 0) { + (void) zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, + "Volume successfully snapshotted, but device links " + "were not created")); + zfs_close(zhp); + return (-1); + } + } + + if (ret != 0) + (void) zfs_standard_error(hdl, errno, errbuf); zfs_close(zhp); @@ -3351,6 +3527,8 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) */ if (zhp->zfs_type == ZFS_TYPE_VOLUME) { + if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) + return (-1); if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); @@ -3388,6 +3566,10 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) */ if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { + if ((err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name))) { + zfs_close(zhp); + return (err); + } if (restore_resv) { new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (old_volsize != new_volsize) @@ -3536,6 +3718,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) } if (recursive) { + struct destroydata dd; parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); if (parentname == NULL) { @@ -3550,6 +3733,15 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) goto error; } + dd.snapname = delim + 1; + dd.gotone = B_FALSE; + dd.closezhp = B_TRUE; + + /* We remove any zvol links prior to renaming them */ + ret = zfs_iter_filesystems(zhrp, 
zfs_check_snap_cb, &dd); + if (ret) { + goto error; + } } else { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL) return (-1); @@ -3598,10 +3790,27 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) * On failure, we still want to remount any filesystems that * were previously mounted, so we don't alter the system state. */ - if (!recursive) + if (recursive) { + struct createdata cd; + + /* only create links for datasets that had existed */ + cd.cd_snapname = delim + 1; + cd.cd_ifexists = B_TRUE; + (void) zfs_iter_filesystems(zhrp, zfs_create_link_cb, + &cd); + } else { (void) changelist_postfix(cl); + } } else { - if (!recursive) { + if (recursive) { + struct createdata cd; + + /* only create links for datasets that had existed */ + cd.cd_snapname = strchr(target, '@') + 1; + cd.cd_ifexists = B_TRUE; + ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb, + &cd); + } else { changelist_rename(cl, zfs_get_name(zhp), target); ret = changelist_postfix(cl); } @@ -3620,21 +3829,105 @@ error: return (ret); } +/* + * Given a zvol dataset, issue the ioctl to create the appropriate minor node, + * and wait briefly for udev to create the /dev link. + */ +int +zvol_create_link(libzfs_handle_t *hdl, const char *dataset) +{ + return (zvol_create_link_common(hdl, dataset, B_FALSE)); +} + +static int +zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists) +{ + zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; + char path[MAXPATHLEN]; + int error; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + + /* + * Issue the appropriate ioctl. + */ + if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) { + switch (errno) { + case EEXIST: + /* + * Silently ignore the case where the link already + * exists. This allows 'zfs volinit' to be run multiple + * times without errors. + */ + return (0); + + case ENOENT: + /* + * Dataset does not exist in the kernel. 
If we + * don't care (see zfs_rename), then ignore the + * error quietly. + */ + if (ifexists) { + return (0); + } + + /* FALLTHROUGH */ + + default: + return (zfs_standard_error_fmt(hdl, errno, + dgettext(TEXT_DOMAIN, "cannot create device links " + "for '%s'"), dataset)); + } + } + + /* + * Wait up to 10 seconds for udev to create the device. + */ + (void) snprintf(path, sizeof (path), "%s/%s", ZVOL_DIR, dataset); + error = zpool_label_disk_wait(path, 10000); + if (error) + (void) printf(gettext("%s may not be immediately " + "available\n"), path); + + return (0); +} + +/* + * Remove a minor node for the given zvol and the associated /dev links. + */ +int +zvol_remove_link(libzfs_handle_t *hdl, const char *dataset) +{ + zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + + if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) { + switch (errno) { + case ENXIO: + /* + * Silently ignore the case where the link no longer + * exists, so that 'zfs volfini' can be run multiple + * times without errors. + */ + return (0); + + default: + return (zfs_standard_error_fmt(hdl, errno, + dgettext(TEXT_DOMAIN, "cannot remove device " + "links for '%s'"), dataset)); + } + } + + return (0); +} + nvlist_t * zfs_get_user_props(zfs_handle_t *zhp) { return (zhp->zfs_user_props); } -nvlist_t * -zfs_get_recvd_props(zfs_handle_t *zhp) -{ - if (zhp->zfs_recvd_props == NULL) - if (get_recvd_props_ioctl(zhp) != 0) - return (NULL); - return (zhp->zfs_recvd_props); -} - /* * This function is used by 'zfs list' to determine the exact set of columns to * display, and their maximum widths. 
This does two main things: @@ -3744,6 +4037,7 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) return (0); } +#ifdef HAVE_ZPL int zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path, char *resource, void *export, void *sharetab, @@ -3763,6 +4057,7 @@ zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path, error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc); return (error); } +#endif /* HAVE_ZPL */ void zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 386ab002f..ee0064892 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -52,9 +52,11 @@ #include #include #include -#include #include +#ifdef HAVE_LIBBLKID +#include +#endif #include "libzfs.h" #include "libzfs_impl.h" @@ -904,211 +906,76 @@ zpool_read_label(int fd, nvlist_t **config) return (0); } -typedef struct rdsk_node { - char *rn_name; - int rn_dfd; - libzfs_handle_t *rn_hdl; - nvlist_t *rn_config; - avl_tree_t *rn_avl; - avl_node_t rn_node; - boolean_t rn_nozpool; -} rdsk_node_t; - -static int -slice_cache_compare(const void *arg1, const void *arg2) -{ - const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; - const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; - char *nm1slice, *nm2slice; - int rv; - - /* - * slices zero and two are the most likely to provide results, - * so put those first - */ - nm1slice = strstr(nm1, "s0"); - nm2slice = strstr(nm2, "s0"); - if (nm1slice && !nm2slice) { - return (-1); - } - if (!nm1slice && nm2slice) { - return (1); - } - nm1slice = strstr(nm1, "s2"); - nm2slice = strstr(nm2, "s2"); - if (nm1slice && !nm2slice) { - return (-1); - } - if (!nm1slice && nm2slice) { - return (1); - } - - rv = strcmp(nm1, nm2); - if (rv == 0) - return (0); - return (rv > 0 ? 
1 : -1); -} - -static void -check_one_slice(avl_tree_t *r, char *diskname, uint_t partno, - diskaddr_t size, uint_t blksz) -{ - rdsk_node_t tmpnode; - rdsk_node_t *node; - char sname[MAXNAMELEN]; - - tmpnode.rn_name = &sname[0]; - (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u", - diskname, partno); - /* - * protect against division by zero for disk labels that - * contain a bogus sector size - */ - if (blksz == 0) - blksz = DEV_BSIZE; - /* too small to contain a zpool? */ - if ((size < (SPA_MINDEVSIZE / blksz)) && - (node = avl_find(r, &tmpnode, NULL))) - node->rn_nozpool = B_TRUE; -} - -static void -nozpool_all_slices(avl_tree_t *r, const char *sname) -{ - char diskname[MAXNAMELEN]; - char *ptr; - int i; - - (void) strncpy(diskname, sname, MAXNAMELEN); - if (((ptr = strrchr(diskname, 's')) == NULL) && - ((ptr = strrchr(diskname, 'p')) == NULL)) - return; - ptr[0] = 's'; - ptr[1] = '\0'; - for (i = 0; i < NDKMAP; i++) - check_one_slice(r, diskname, i, 0, 1); - ptr[0] = 'p'; - for (i = 0; i <= FD_NUMPART; i++) - check_one_slice(r, diskname, i, 0, 1); -} - -static void -check_slices(avl_tree_t *r, int fd, const char *sname) -{ - struct extvtoc vtoc; - struct dk_gpt *gpt; - char diskname[MAXNAMELEN]; - char *ptr; - int i; - - (void) strncpy(diskname, sname, MAXNAMELEN); - if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1])) - return; - ptr[1] = '\0'; - - if (read_extvtoc(fd, &vtoc) >= 0) { - for (i = 0; i < NDKMAP; i++) - check_one_slice(r, diskname, i, - vtoc.v_part[i].p_size, vtoc.v_sectorsz); - } else if (efi_alloc_and_read(fd, &gpt) >= 0) { - /* - * on x86 we'll still have leftover links that point - * to slices s[9-15], so use NDKMAP instead - */ - for (i = 0; i < NDKMAP; i++) - check_one_slice(r, diskname, i, - gpt->efi_parts[i].p_size, gpt->efi_lbasize); - /* nodes p[1-4] are never used with EFI labels */ - ptr[0] = 'p'; - for (i = 1; i <= FD_NUMPART; i++) - check_one_slice(r, diskname, i, 0, 1); - efi_free(gpt); - } -} - -static void 
-zpool_open_func(void *arg) -{ - rdsk_node_t *rn = arg; - struct stat64 statbuf; - nvlist_t *config; - int fd; - - if (rn->rn_nozpool) - return; - if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { - /* symlink to a device that's no longer there */ - if (errno == ENOENT) - nozpool_all_slices(rn->rn_avl, rn->rn_name); - return; - } - /* - * Ignore failed stats. We only want regular - * files, character devs and block devs. - */ - if (fstat64(fd, &statbuf) != 0 || - (!S_ISREG(statbuf.st_mode) && - !S_ISCHR(statbuf.st_mode) && - !S_ISBLK(statbuf.st_mode))) { - (void) close(fd); - return; - } - /* this file is too small to hold a zpool */ - if (S_ISREG(statbuf.st_mode) && - statbuf.st_size < SPA_MINDEVSIZE) { - (void) close(fd); - return; - } else if (!S_ISREG(statbuf.st_mode)) { - /* - * Try to read the disk label first so we don't have to - * open a bunch of minor nodes that can't have a zpool. - */ - check_slices(rn->rn_avl, fd, rn->rn_name); - } - - if ((zpool_read_label(fd, &config)) != 0) { - (void) close(fd); - (void) no_memory(rn->rn_hdl); - return; - } - (void) close(fd); - - - rn->rn_config = config; - if (config != NULL) { - assert(rn->rn_nozpool == B_FALSE); - } -} - +#ifdef HAVE_LIBBLKID /* - * Given a file descriptor, clear (zero) the label information. This function - * is currently only used in the appliance stack as part of the ZFS sysevent - * module. 
+ * Use libblkid to quickly search for zfs devices */ -int -zpool_clear_label(int fd) +static int +zpool_find_import_blkid(libzfs_handle_t *hdl, pool_list_t *pools) { - struct stat64 statbuf; - int l; - vdev_label_t *label; - uint64_t size; + blkid_cache cache; + blkid_dev_iterate iter; + blkid_dev dev; + const char *devname; + nvlist_t *config; + int fd, err; - if (fstat64(fd, &statbuf) == -1) - return (0); - size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); - - if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL) - return (-1); - - for (l = 0; l < VDEV_LABELS; l++) { - if (pwrite64(fd, label, sizeof (vdev_label_t), - label_offset(size, l)) != sizeof (vdev_label_t)) - return (-1); + err = blkid_get_cache(&cache, NULL); + if (err != 0) { + (void) zfs_error_fmt(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, "blkid_get_cache() %d"), err); + goto err_blkid1; } - free(label); - return (0); + err = blkid_probe_all(cache); + if (err != 0) { + (void) zfs_error_fmt(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, "blkid_probe_all() %d"), err); + goto err_blkid2; + } + + iter = blkid_dev_iterate_begin(cache); + if (iter == NULL) { + (void) zfs_error_fmt(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, "blkid_dev_iterate_begin()")); + goto err_blkid2; + } + + err = blkid_dev_set_search(iter, "TYPE", "zfs"); + if (err != 0) { + (void) zfs_error_fmt(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, "blkid_dev_set_search() %d"), err); + goto err_blkid3; + } + + while (blkid_dev_next(iter, &dev) == 0) { + devname = blkid_dev_devname(dev); + if ((fd = open64(devname, O_RDONLY)) < 0) + continue; + + err = zpool_read_label(fd, &config); + (void) close(fd); + + if (err != 0) { + (void) no_memory(hdl); + goto err_blkid3; + } + + if (config != NULL) { + err = add_config(hdl, pools, devname, config); + if (err != 0) + goto err_blkid3; + } + } + +err_blkid3: + blkid_dev_iterate_end(iter); +err_blkid2: + blkid_put_cache(cache); +err_blkid1: + return err; } +#endif /* HAVE_LIBBLKID 
*/ /* * Given a list of directories to search, find all pools stored on disk. This @@ -1126,18 +993,28 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) char path[MAXPATHLEN]; char *end, **dir = iarg->path; size_t pathleft; - nvlist_t *ret = NULL; - static char *default_dir = "/dev/dsk"; + struct stat64 statbuf; + nvlist_t *ret = NULL, *config; + static char *default_dir = DISK_ROOT; + int fd; pool_list_t pools = { 0 }; pool_entry_t *pe, *penext; vdev_entry_t *ve, *venext; config_entry_t *ce, *cenext; name_entry_t *ne, *nenext; - avl_tree_t slice_cache; - rdsk_node_t *slice; - void *cookie; + + verify(iarg->poolname == NULL || iarg->guid == 0); if (dirs == 0) { +#ifdef HAVE_LIBBLKID + /* Use libblkid to scan all devices for their type */ + if (zpool_find_import_blkid(hdl, &pools) == 0) + goto skip_scanning; + + (void) zfs_error_fmt(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, "blkid failure falling back " + "to manual probing")); +#endif /* HAVE_LIBBLKID */ dirs = 1; dir = &default_dir; } @@ -1148,7 +1025,6 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) * and toplevel GUID. */ for (i = 0; i < dirs; i++) { - tpool_t *t; char *rdsk; int dfd; @@ -1182,8 +1058,6 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) goto error; } - avl_create(&slice_cache, slice_cache_compare, - sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); /* * This is not MT-safe, but we have no MT consumers of libzfs */ @@ -1193,37 +1067,51 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) (name[1] == 0 || (name[1] == '.'
&& name[2] == 0))) continue; - slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); - slice->rn_name = zfs_strdup(hdl, name); - slice->rn_avl = &slice_cache; - slice->rn_dfd = dfd; - slice->rn_hdl = hdl; - slice->rn_nozpool = B_FALSE; - avl_add(&slice_cache, slice); - } - /* - * create a thread pool to do all of this in parallel; - * rn_nozpool is not protected, so this is racy in that - * multiple tasks could decide that the same slice can - * not hold a zpool, which is benign. Also choose - * double the number of processors; we hold a lot of - * locks in the kernel, so going beyond this doesn't - * buy us much. - */ - t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), - 0, NULL); - for (slice = avl_first(&slice_cache); slice; - (slice = avl_walk(&slice_cache, slice, - AVL_AFTER))) - (void) tpool_dispatch(t, zpool_open_func, slice); - tpool_wait(t); - tpool_destroy(t); + /* + * Skip checking devices with well known prefixes: + * watchdog - A special close is required to avoid + * triggering it and resetting the system. + * fuse - Fuse control device. + * ppp - Generic PPP driver. + * tty* - Generic serial interface. + * vcs* - Virtual console memory. + * parport* - Parallel port interface. + * lp* - Printer interface. + * fd* - Floppy interface. + */ + if ((strncmp(name, "watchdog", 8) == 0) || + (strncmp(name, "fuse", 4) == 0) || + (strncmp(name, "ppp", 3) == 0) || + (strncmp(name, "tty", 3) == 0) || + (strncmp(name, "vcs", 3) == 0) || + (strncmp(name, "parport", 7) == 0) || + (strncmp(name, "lp", 2) == 0) || + (strncmp(name, "fd", 2) == 0)) + continue; - cookie = NULL; - while ((slice = avl_destroy_nodes(&slice_cache, - &cookie)) != NULL) { - if (slice->rn_config != NULL) { - nvlist_t *config = slice->rn_config; + if ((fd = openat64(dfd, name, O_RDONLY)) < 0) + continue; + + /* + * Ignore failed stats. We only want regular + * files and block devs. 
+ */ + if (fstat64(fd, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && + !S_ISBLK(statbuf.st_mode))) { + (void) close(fd); + continue; + } + + if ((zpool_read_label(fd, &config)) != 0) { + (void) close(fd); + (void) no_memory(hdl); + goto error; + } + + (void) close(fd); + + if (config != NULL) { boolean_t matched = B_TRUE; if (iarg->poolname != NULL) { @@ -1247,19 +1135,19 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) continue; } /* use the non-raw path for the config */ - (void) strlcpy(end, slice->rn_name, pathleft); + (void) strlcpy(end, name, pathleft); if (add_config(hdl, &pools, path, config) != 0) goto error; } - free(slice->rn_name); - free(slice); } - avl_destroy(&slice_cache); (void) closedir(dirp); dirp = NULL; } +#ifdef HAVE_LIBBLKID +skip_scanning: +#endif ret = get_configs(hdl, &pools, iarg->can_be_active); error: diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index c31a12371..4b9038de8 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -81,6 +81,7 @@ #include #define MAXISALEN 257 /* based on sysinfo(2) man page */ +#ifdef HAVE_ZPL static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *); zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, zfs_share_proto_t); @@ -1268,3 +1269,53 @@ out: return (ret); } + +#else /* HAVE_ZPL */ + +int +zfs_unshare_iscsi(zfs_handle_t *zhp) +{ + return 0; +} + +int +zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) +{ + return 0; +} + +void +remove_mountpoint(zfs_handle_t *zhp) { + return; +} + +boolean_t +is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where) +{ + return B_FALSE; +} + +boolean_t +zfs_is_mounted(zfs_handle_t *zhp, char **where) +{ + return is_mounted(zhp->zfs_hdl, zfs_get_name(zhp), where); +} + +boolean_t +zfs_is_shared(zfs_handle_t *zhp) +{ + return B_FALSE; +} + +int +zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) +{ + return B_FALSE; +} + +int 
+zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) +{ + return B_FALSE; +} +#endif /* HAVE_ZPL */ diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 42f303894..ec27b5756 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -44,10 +46,6 @@ static int read_efi_label(nvlist_t *config, diskaddr_t *sb); -#define DISK_ROOT "/dev/dsk" -#define RDISK_ROOT "/dev/rdsk" -#define BACKUP_SLICE "s2" - typedef struct prop_flags { int create:1; /* Validate property on creation */ int import:1; /* Validate property on import */ @@ -651,9 +649,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) /* * Don't start the slice at the default block of 34; many storage - * devices will use a stripe width of 128k, so start there instead. + * devices will use a stripe width of 128k, other vendors prefer a 1m + * alignment. It is best to play it safe and ensure a 1m alignment + * given 512b blocks. When the block size is larger by a power of 2 + * we will still be 1m aligned. */ -#define NEW_START_BLOCK 256 +#define NEW_START_BLOCK 2048 /* * Validate the given pool name, optionally putting an extended error message in @@ -948,10 +949,12 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a - * label. + * label. This can also happen if the device is + * part of an active md or lvm device.
*/ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "one or more vdevs refer to the same device")); + "one or more vdevs refer to the same device, or one of\n" + "the devices is part of an active md or lvm device")); return (zfs_error(hdl, EZFS_BADDEV, msg)); case EOVERFLOW: @@ -1928,7 +1931,7 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, } else if (zpool_vdev_is_interior(path)) { verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0); } else if (path[0] != '/') { - (void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path); + (void) snprintf(buf, sizeof (buf), "%s/%s", DISK_ROOT, path); verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0); } else { verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0); @@ -2101,22 +2104,14 @@ zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size) * the disk to use the new unallocated space. */ static int -zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) +zpool_relabel_disk(libzfs_handle_t *hdl, const char *path) { - char path[MAXPATHLEN]; char errbuf[1024]; int fd, error; - int (*_efi_use_whole_disk)(int); - if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT, - "efi_use_whole_disk")) == NULL) - return (-1); - - (void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name); - - if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { + if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " - "relabel '%s': unable to open device"), name); + "relabel '%s': unable to open device"), path); return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); } @@ -2125,11 +2120,11 @@ zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) * does not have any unallocated space left. If so, we simply * ignore that error and continue on. 
*/ - error = _efi_use_whole_disk(fd); + error = efi_use_whole_disk(fd); (void) close(fd); if (error && error != VT_ENOSPC) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " - "relabel '%s': unable to read disk capacity"), name); + "relabel '%s': unable to read disk capacity"), path); return (zfs_error(hdl, EZFS_NOCAP, errbuf)); } return (0); @@ -3071,7 +3066,7 @@ char * zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, boolean_t verbose) { - char *path, *devid; + char *path, *devid, *type; uint64_t value; char buf[64]; vdev_stat_t *vs; @@ -3085,7 +3080,6 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, (u_longlong_t)value); path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { - /* * If the device is dead (faulted, offline, etc) then don't * bother opening it. Otherwise we may be forcing the user to @@ -3124,9 +3118,19 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, devid_str_free(newdevid); } - if (strncmp(path, "/dev/dsk/", 9) == 0) - path += 9; + /* + * For a block device only use the name. + */ + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + if (strcmp(type, VDEV_TYPE_DISK) == 0) { + path = strrchr(path, '/'); + path++; + } +#if defined(__sun__) || defined(__sun) + /* + * The following code strips the slice from the device path. 
+ */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value) { int pathlen = strlen(path); @@ -3148,6 +3152,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, } return (tmp); } +#endif } else { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0); @@ -3629,7 +3634,7 @@ read_efi_label(nvlist_t *config, diskaddr_t *sb) (void) snprintf(diskname, sizeof (diskname), "%s%s", RDISK_ROOT, strrchr(path, '/')); - if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) { + if ((fd = open(diskname, O_RDWR|O_DIRECT)) >= 0) { struct dk_gpt *vtoc; if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { @@ -3675,6 +3680,54 @@ find_start_block(nvlist_t *config) return (MAXOFFSET_T); } +int +zpool_label_disk_wait(char *path, int timeout) +{ + struct stat64 statbuf; + int i; + + /* + * Wait timeout milliseconds for a newly created device to be available + * from the given path. There is a small window when a /dev/ device + * will exist and the udev link will not, so we must wait for the + * symlink. Depending on the udev rules this may take a few seconds. + */ + for (i = 0; i < timeout; i++) { + usleep(1000); + + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) + return (0); + } + + return (ENOENT); +} + +int +zpool_label_disk_check(char *path) +{ + struct dk_gpt *vtoc; + int fd, err; + + if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) + return errno; + + if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) { + (void) close(fd); + return err; + } + + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { + efi_free(vtoc); + (void) close(fd); + return EIDRM; + } + + efi_free(vtoc); + (void) close(fd); + return 0; +} + /* * Label an individual disk. The name provided is the short name, * stripped of any leading /dev path.
@@ -3684,7 +3737,7 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) { char path[MAXPATHLEN]; struct dk_gpt *vtoc; - int fd; + int rval, fd; size_t resv = EFI_MIN_RESV_SIZE; uint64_t slice_size; diskaddr_t start_block; @@ -3720,13 +3773,13 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, BACKUP_SLICE); - if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { + if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) { /* * This shouldn't happen. We've long since verified that this * is a valid device. */ - zfs_error_aux(hdl, - dgettext(TEXT_DOMAIN, "unable to open device")); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "unable to open device '%s': %d"), path, errno); return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); } @@ -3769,7 +3822,7 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) vtoc->efi_parts[8].p_size = resv; vtoc->efi_parts[8].p_tag = V_RESERVED; - if (efi_write(fd, vtoc) != 0) { + if ((rval = efi_write(fd, vtoc)) != 0) { /* * Some block drivers (like pcata) may not support EFI * GPT labels. 
Print out a helpful error message dir- @@ -3779,123 +3832,34 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) (void) close(fd); efi_free(vtoc); - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "try using fdisk(1M) and then provide a specific slice")); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using " + "parted(8) and then provide a specific slice: %d"), rval); return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); } (void) close(fd); efi_free(vtoc); - return (0); -} - -static boolean_t -supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf) -{ - char *type; - nvlist_t **child; - uint_t children, c; - - verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || - strcmp(type, VDEV_TYPE_FILE) == 0 || - strcmp(type, VDEV_TYPE_LOG) == 0 || - strcmp(type, VDEV_TYPE_HOLE) == 0 || - strcmp(type, VDEV_TYPE_MISSING) == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "vdev type '%s' is not supported"), type); - (void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf); - return (B_FALSE); - } - if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) { - if (!supported_dump_vdev_type(hdl, child[c], errbuf)) - return (B_FALSE); - } - } - return (B_TRUE); -} - -/* - * check if this zvol is allowable for use as a dump device; zero if - * it is, > 0 if it isn't, < 0 if it isn't a zvol - */ -int -zvol_check_dump_config(char *arg) -{ - zpool_handle_t *zhp = NULL; - nvlist_t *config, *nvroot; - char *p, *volname; - nvlist_t **top; - uint_t toplevels; - libzfs_handle_t *hdl; - char errbuf[1024]; - char poolname[ZPOOL_MAXNAMELEN]; - int pathlen = strlen(ZVOL_FULL_DEV_DIR); - int ret = 1; - - if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) { - return (-1); - } - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "dump is not supported on device '%s'"), arg); - - if ((hdl = libzfs_init()) == NULL) - return (1); - 
libzfs_print_on_error(hdl, B_TRUE); - - volname = arg + pathlen; - - /* check the configuration of the pool */ - if ((p = strchr(volname, '/')) == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "malformed dataset name")); - (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); - return (1); - } else if (p - volname >= ZFS_MAXNAMELEN) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "dataset name is too long")); - (void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf); - return (1); - } else { - (void) strncpy(poolname, volname, p - volname); - poolname[p - volname] = '\0'; - } - - if ((zhp = zpool_open(hdl, poolname)) == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "could not open pool '%s'"), poolname); - (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); - goto out; - } - config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "could not obtain vdev configuration for '%s'"), poolname); - (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); - goto out; - } - - verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &top, &toplevels) == 0); - if (toplevels != 1) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' has multiple top level vdevs"), poolname); - (void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf); - goto out; - } - - if (!supported_dump_vdev_type(hdl, top[0], errbuf)) { - goto out; - } - ret = 0; - -out: - if (zhp) - zpool_close(zhp); - libzfs_fini(hdl); - return (ret); + + /* Wait for the first expected slice to appear. */ + (void) snprintf(path, sizeof (path), "%s/%s%s%s", DISK_ROOT, name, + isdigit(name[strlen(name)-1]) ? "p" : "", FIRST_SLICE); + rval = zpool_label_disk_wait(path, 3000); + if (rval) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to " + "detect device partitions on '%s': %d"), path, rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + + /* We can't be too paranoid. Read the label back and verify it.
*/ + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + rval = zpool_label_disk_check(path); + if (rval) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written " + "EFI label on '%s' is damaged. Ensure\nthis device " + "is not in in use, and is functioning properly: %d"), + path, rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + + return 0; } diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 87ffd124f..40d1d2e53 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -2608,6 +2608,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, return (-1); } } + if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME && + zvol_remove_link(hdl, zhp->zfs_name) != 0) { + zfs_close(zhp); + zcmd_free_nvlists(&zc); + return (-1); + } zfs_close(zhp); } else { /* @@ -2813,6 +2819,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, if (h != NULL) { if (h->zfs_type == ZFS_TYPE_VOLUME) { *cp = '@'; + err = zvol_create_link(hdl, h->zfs_name); + if (err == 0 && ioctl_err == 0) + err = zvol_create_link(hdl, + zc.zc_value); } else if (newfs || stream_avl) { /* * Track the first/top of hierarchy fs, diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index cb7d87cb2..71f81831b 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -648,7 +649,9 @@ libzfs_fini(libzfs_handle_t *hdl) #endif if (hdl->libzfs_sharetab) (void) fclose(hdl->libzfs_sharetab); +#ifdef HAVE_ZPL zfs_uninit_libshare(hdl); +#endif if (hdl->libzfs_log_str) (void) free(hdl->libzfs_log_str); zpool_free_handles(hdl); diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 494e544ea..6f06f4001 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -35,6 +35,8 @@ #include #include #include +#include +#include /* for BLKGETSIZE64 */ #include /* @@ -533,7 +535,11 @@ 
vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) * for its size. So -- gag -- we open the block device to get * its size, and remember it for subsequent VOP_GETATTR(). */ +#if defined(__sun__) || defined(__sun) if (strncmp(path, "/dev/", 5) == 0) { +#else + if (0) { +#endif char *dsk; fd = open64(path, O_RDONLY); if (fd == -1) { @@ -562,6 +568,14 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) } } + if (!(flags & FCREAT) && S_ISBLK(st.st_mode)) { +#ifdef __linux__ + flags |= O_DIRECT; +#endif + /* We shouldn't be writing to block devices in userspace */ + VERIFY(!(flags & FWRITE)); + } + if (flags & FCREAT) old_umask = umask(0); @@ -584,6 +598,16 @@ vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) return (err); } +#ifdef __linux__ + /* In Linux, use an ioctl to get the size of a block device. */ + if (S_ISBLK(st.st_mode)) { + if (ioctl(fd, BLKGETSIZE64, &st.st_size) != 0) { + err = errno; + close(fd); + return (err); + } + } +#endif (void) fcntl(fd, F_SETFD, FD_CLOEXEC); *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); @@ -637,6 +661,16 @@ vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, } } +#ifdef __linux__ + if (rc == -1 && errno == EINVAL) { + /* + * Under Linux, this most likely means an alignment issue + * (memory or disk) due to O_DIRECT, so we abort() in order to + * catch the offender. 
+ */ + abort(); + } +#endif if (rc == -1) return (errno); diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 37b815f94..ee1140ffb 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -1436,6 +1436,7 @@ sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) int sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) { +#ifdef HAVE_ZPL int error; sa_bulk_attr_t bulk; @@ -1452,7 +1453,9 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) } mutex_exit(&hdl->sa_lock); return (error); - +#else + return ENOSYS; +#endif /* HAVE_ZPL */ } #endif diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index d141e43d7..88fd78966 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -67,7 +67,7 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { }; #ifdef _KERNEL - +#ifdef HAVE_ZPL int zfs_sa_readlink(znode_t *zp, uio_t *uio) { @@ -331,4 +331,5 @@ zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) } } +#endif /* HAVE_ZPL */ #endif