diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c
index 3bcdf6e1d..a510d646e 100644
--- a/cmd/zed/agents/zfs_mod.c
+++ b/cmd/zed/agents/zfs_mod.c
@@ -183,14 +183,14 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	nvlist_t *nvroot, *newvd;
 	pendingdev_t *device;
 	uint64_t wholedisk = 0ULL;
-	uint64_t offline = 0ULL;
+	uint64_t offline = 0ULL, faulted = 0ULL;
 	uint64_t guid = 0ULL;
 	char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
 	char rawpath[PATH_MAX], fullpath[PATH_MAX];
 	char devpath[PATH_MAX];
 	int ret;
-	boolean_t is_dm = B_FALSE;
 	boolean_t is_sd = B_FALSE;
+	boolean_t is_mpath_wholedisk = B_FALSE;
 	uint_t c;
 	vdev_stat_t *vs;
 
@@ -211,15 +211,73 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	    &enc_sysfs_path);
 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted);
+
 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
 
-	if (offline)
-		return; /* don't intervene if it was taken offline */
+	/*
+	 * Special case:
+	 *
+	 * We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH
+	 * entry in its config. For example, on this force-faulted disk:
+	 *
+	 *     children[0]:
+	 *       type: 'disk'
+	 *       id: 0
+	 *       guid: 14309659774640089719
+	 *       path: '/dev/disk/by-vdev/L28'
+	 *       whole_disk: 0
+	 *       DTL: 654
+	 *       create_txg: 4
+	 *       com.delphix:vdev_zap_leaf: 1161
+	 *       faulted: 1
+	 *       aux_state: 'external'
+	 *     children[1]:
+	 *       type: 'disk'
+	 *       id: 1
+	 *       guid: 16002508084177980912
+	 *       path: '/dev/disk/by-vdev/L29'
+	 *       devid: 'dm-uuid-mpath-35000c500a61d68a3'
+	 *       phys_path: 'L29'
+	 *       vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
+	 *       whole_disk: 0
+	 *       DTL: 1028
+	 *       create_txg: 4
+	 *       com.delphix:vdev_zap_leaf: 131
+	 *
+	 * If the disk's path is a /dev/disk/by-vdev/ path, then we can infer
+	 * the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name.
+	 */
+	if (physpath == NULL && path != NULL) {
+		/* If path begins with "/dev/disk/by-vdev/" ... */
+		if (strncmp(path, DEV_BYVDEV_PATH,
+		    strlen(DEV_BYVDEV_PATH)) == 0) {
+			/* Set physpath to the char after "/dev/disk/by-vdev" */
+			physpath = &path[strlen(DEV_BYVDEV_PATH)];
+		}
+	}
 
-	is_dm = zfs_dev_is_dm(path);
+	/*
+	 * We don't want to autoreplace offlined disks. However, we do want to
+	 * replace force-faulted disks (`zpool offline -f`). Force-faulted
+	 * disks have both offline=1 and faulted=1 in the nvlist.
+	 */
+	if (offline && !faulted) {
+		zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace",
+		    __func__, path);
+		return;
+	}
+
+	is_mpath_wholedisk = is_mpath_whole_disk(path);
 	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
-	    " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
-	    physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
+	    " %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', "
+	    "(guid %llu)",
+	    zpool_get_name(zhp), path,
+	    physpath ? physpath : "NULL",
+	    wholedisk ? "is" : "not",
+	    is_mpath_wholedisk ? "is" : "not",
+	    labeled ? "is" : "not",
+	    enc_sysfs_path ? enc_sysfs_path : "NULL",
 	    (long long unsigned int)guid);
 
 	/*
@@ -253,8 +311,9 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
 	    (newstate == VDEV_STATE_HEALTHY ||
 	    newstate == VDEV_STATE_DEGRADED)) {
-		zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s",
-		    fullpath, (newstate == VDEV_STATE_HEALTHY) ?
+		zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev '%s' ('%s') "
+		    "is %s", fullpath, physpath ? physpath : "NULL",
+		    (newstate == VDEV_STATE_HEALTHY) ?
 		    "HEALTHY" : "DEGRADED");
 		return;
 	}
@@ -271,11 +330,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	 * vdev online to trigger a FMA fault by posting an ereport.
 	 */
 	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
-	    !(wholedisk || is_dm) || (physpath == NULL)) {
+	    !(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) {
 		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
 		    &newstate);
 		zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
-		    "not a whole disk for '%s'", fullpath);
+		    "not a blank disk for '%s' ('%s')", fullpath,
+		    physpath ? physpath : "NULL");
 		return;
 	}
 
@@ -287,7 +347,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
 	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
 
-	if (realpath(rawpath, devpath) == NULL && !is_dm) {
+	if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
 		zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
 		    rawpath, strerror(errno));
 
@@ -303,12 +363,14 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
 	    (vs->vs_state != VDEV_STATE_FAULTED) &&
 	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
+		zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in "
+		    "a bad state (currently %d)", (int)vs->vs_state);
 		return;
 	}
 
 	nvlist_lookup_string(vdev, "new_devid", &new_devid);
 
-	if (is_dm) {
+	if (is_mpath_wholedisk) {
 		/* Don't label device mapper or multipath disks. */
 	} else if (!labeled) {
 		/*
@@ -522,8 +584,11 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
 		 * the dp->dd_compare value.
 		 */
 		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
-		    strcmp(dp->dd_compare, path) != 0)
+		    strcmp(dp->dd_compare, path) != 0) {
+			zed_log_msg(LOG_INFO, " %s: no match (%s != vdev %s)",
+			    __func__, dp->dd_compare, path ? path : "NULL");
 			return;
+		}
 
 		zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s",
 		    dp->dd_prop, path);
@@ -571,6 +636,8 @@ zfs_iter_pool(zpool_handle_t *zhp, void *data)
 			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
 			zfs_iter_vdev(zhp, nvl, data);
 		}
+	} else {
+		zed_log_msg(LOG_INFO, "%s: no config", __func__);
 	}
 
 	/*
@@ -619,6 +686,72 @@ devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
 	return (data.dd_found);
 }
 
+/*
+ * Given a device identifier, find any vdevs with a matching by-vdev
+ * path. Normally we shouldn't need this as the comparison would be
+ * made earlier in the devphys_iter(). For example, if we were replacing
+ * /dev/disk/by-vdev/L28, normally devphys_iter() would match the
+ * ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28"
+ * of the new disk config. However, we've seen cases where
+ * ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk. Here's
+ * an example of a real 2-disk mirror pool where one disk was force
+ * faulted:
+ *
+ *     com.delphix:vdev_zap_top: 129
+ *     children[0]:
+ *       type: 'disk'
+ *       id: 0
+ *       guid: 14309659774640089719
+ *       path: '/dev/disk/by-vdev/L28'
+ *       whole_disk: 0
+ *       DTL: 654
+ *       create_txg: 4
+ *       com.delphix:vdev_zap_leaf: 1161
+ *       faulted: 1
+ *       aux_state: 'external'
+ *     children[1]:
+ *       type: 'disk'
+ *       id: 1
+ *       guid: 16002508084177980912
+ *       path: '/dev/disk/by-vdev/L29'
+ *       devid: 'dm-uuid-mpath-35000c500a61d68a3'
+ *       phys_path: 'L29'
+ *       vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
+ *       whole_disk: 0
+ *       DTL: 1028
+ *       create_txg: 4
+ *       com.delphix:vdev_zap_leaf: 131
+ *
+ * So in the case above, the only thing we could compare is the path.
+ *
+ * We can do this because we assume by-vdev paths are authoritative as physical
+ * paths. We could not assume this for normal paths like /dev/sda since the
+ * physical location /dev/sda points to could change over time.
+ */
+static boolean_t
+by_vdev_path_iter(const char *by_vdev_path, const char *devid,
+    zfs_process_func_t func, boolean_t is_slice)
+{
+	dev_data_t data = { 0 };
+
+	data.dd_compare = by_vdev_path;
+	data.dd_func = func;
+	data.dd_prop = ZPOOL_CONFIG_PATH;
+	data.dd_found = B_FALSE;
+	data.dd_islabeled = is_slice;
+	data.dd_new_devid = devid;
+
+	if (strncmp(by_vdev_path, DEV_BYVDEV_PATH,
+	    strlen(DEV_BYVDEV_PATH)) != 0) {
+		/* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */
+		return (B_FALSE);
+	}
+
+	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+	return (data.dd_found);
+}
+
 /*
  * Given a device identifier, find any vdevs with a matching devid.
  * On Linux we can match devid directly which is always a whole disk.
@@ -683,15 +816,17 @@ guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid,
 static int
 zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
 {
-	char *devpath = NULL, *devid;
+	char *devpath = NULL, *devid = NULL;
 	uint64_t pool_guid = 0, vdev_guid = 0;
 	boolean_t is_slice;
 
 	/*
 	 * Expecting a devid string and an optional physical location and guid
 	 */
-	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
+	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) {
+		zed_log_msg(LOG_INFO, "%s: no dev identifier", __func__);
 		return (-1);
+	}
 
 	(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
 	(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
@@ -707,6 +842,8 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
 	 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
 	 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
 	 * 3. ZPOOL_CONFIG_GUID (identifies unique vdev).
+	 * 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since
+	 *    by-vdev paths represent physical paths).
 	 */
 	if (devid_iter(devid, zfs_process_add, is_slice))
 		return (0);
@@ -717,6 +854,16 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
 		(void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add,
 		    is_slice);
 
+	if (devpath != NULL) {
+		/* Can we match a /dev/disk/by-vdev/ path? */
+		char by_vdev_path[MAXPATHLEN];
+		(void) snprintf(by_vdev_path, sizeof (by_vdev_path),
+		    "%s%s", DEV_BYVDEV_PATH, devpath);
+		if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add,
+		    is_slice))
+			return (0);
+	}
+
 	return (0);
 }
 
diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c
index 94e242360..52b80d8c4 100644
--- a/cmd/zed/zed_disk_event.c
+++ b/cmd/zed/zed_disk_event.c
@@ -215,6 +215,11 @@ zed_udev_monitor(void *arg)
 		if (type != NULL && type[0] != '\0' &&
 		    strcmp(type, "disk") == 0 &&
 		    part != NULL && part[0] != '\0') {
+			zed_log_msg(LOG_INFO,
+			    "%s: skip %s since it has a %s partition already",
+			    __func__,
+			    udev_device_get_property_value(dev, "DEVNAME"),
+			    part);
 			/* skip and wait for partition event */
 			udev_device_unref(dev);
 			continue;
@@ -229,6 +234,11 @@ zed_udev_monitor(void *arg)
 		sectors = udev_device_get_sysattr_value(dev, "size");
 		if (sectors != NULL &&
 		    strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
+			zed_log_msg(LOG_INFO,
+			    "%s: %s sectors %s < %llu (minimum)",
+			    __func__,
+			    udev_device_get_property_value(dev, "DEVNAME"),
+			    sectors, (unsigned long long)MINIMUM_SECTORS);
 			udev_device_unref(dev);
 			continue;
 		}
diff --git a/lib/libzutil/os/linux/zutil_device_path_os.c b/lib/libzutil/os/linux/zutil_device_path_os.c
index 13f8bd031..83b077927 100644
--- a/lib/libzutil/os/linux/zutil_device_path_os.c
+++ b/lib/libzutil/os/linux/zutil_device_path_os.c
@@ -527,7 +527,7 @@ zfs_dev_is_dm(const char *dev_name)
 boolean_t
 zfs_dev_is_whole_disk(const char *dev_name)
 {
-	struct dk_gpt *label;
+	struct dk_gpt *label = NULL;
 	int fd;
 
 	if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0)
@@ -613,22 +613,27 @@ zfs_get_underlying_path(const char *dev_name)
 /*
  * A disk is considered a multipath whole disk when:
  *	DEVNAME key value has "dm-"
- *	DM_NAME key value has "mpath" prefix
+ *	MPATH_DEVICE_READY key value begins with "1"
  *	DM_UUID key exists
  *	ID_PART_TABLE_TYPE key does not exist or is not gpt
+ *	ID_FS_LABEL key does not exist (disk isn't labeled)
  */
 static boolean_t
-udev_mpath_whole_disk(struct udev_device *dev)
+is_mpath_udev_sane(struct udev_device *dev)
 {
-	const char *devname, *type, *uuid;
+	const char *devname, *type, *uuid, *label, *mpath_ready;
 
 	devname = udev_device_get_property_value(dev, "DEVNAME");
 	type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
 	uuid = udev_device_get_property_value(dev, "DM_UUID");
+	label = udev_device_get_property_value(dev, "ID_FS_LABEL");
+	mpath_ready = udev_device_get_property_value(dev, "MPATH_DEVICE_READY");
 
 	if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
 	    ((type == NULL) || (strcmp(type, "gpt") != 0)) &&
-	    (uuid != NULL)) {
+	    (uuid != NULL) &&
+	    (label == NULL) &&
+	    (mpath_ready != NULL && strncmp(mpath_ready, "1", 1) == 0)) {
 		return (B_TRUE);
 	}
 
@@ -636,7 +641,11 @@ udev_mpath_whole_disk(struct udev_device *dev)
 }
 
 /*
- * Check if a disk is effectively a multipath whole disk
+ * Check if a disk is a multipath "blank" disk:
+ *
+ * 1. The disk has udev values that suggest it's a multipath disk
+ * 2. The disk is not currently labeled with a filesystem of any type
+ * 3. The disk does not have a GPT partition table
  */
 boolean_t
 is_mpath_whole_disk(const char *path)
@@ -645,7 +654,6 @@ is_mpath_whole_disk(const char *path)
 	struct udev_device *dev = NULL;
 	char nodepath[MAXPATHLEN];
 	char *sysname;
-	boolean_t wholedisk = B_FALSE;
 
 	if (realpath(path, nodepath) == NULL)
 		return (B_FALSE);
@@ -660,10 +668,11 @@ is_mpath_whole_disk(const char *path)
 		return (B_FALSE);
 	}
 
-	wholedisk = udev_mpath_whole_disk(dev);
-
+	/* Sanity check some udev values */
+	boolean_t is_sane = is_mpath_udev_sane(dev);
 	udev_device_unref(dev);
-	return (wholedisk);
+
+	return (is_sane);
 }
 
 #else /* HAVE_LIBUDEV */