From 9a49d3f3d3bfa26df4e5e54d574cb490f0ee284b Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 3 Jul 2020 11:05:50 -0700 Subject: [PATCH] Add device rebuild feature The device_rebuild feature enables sequential reconstruction when resilvering. Mirror vdevs can be rebuilt in LBA order which may more quickly restore redundancy depending on the pool's average block size, overall fragmentation and the performance characteristics of the devices. However, block checksums cannot be verified as part of the rebuild thus a scrub is automatically started after the sequential resilver completes. The new '-s' option has been added to the `zpool attach` and `zpool replace` commands to request sequential reconstruction instead of healing reconstruction when resilvering. zpool attach -s zpool replace -s The `zpool status` output has been updated to report the progress of sequential resilvering in the same way as healing resilvering. The one notable difference is that multiple sequential resilvers may be in progress as long as they're operating on different top-level vdevs. The `zpool wait -t resilver` command was extended to wait on sequential resilvers. From this perspective they are no different than healing resilvers. Sequential resilvers cannot be supported for RAIDZ, but are compatible with the dRAID feature being developed. As part of this change the resilver_restart_* tests were moved into the functional/replacement directory. Additionally, the replacement tests were renamed and extended to verify both resilvering and rebuilding. 
Original-patch-by: Isaac Huang Reviewed-by: Tony Hutter Reviewed-by: John Poduska Co-authored-by: Mark Maybee Signed-off-by: Brian Behlendorf Closes #10349 --- cmd/zed/agents/zfs_mod.c | 2 +- cmd/zed/agents/zfs_retire.c | 11 +- cmd/zed/zed.d/resilver_finish-start-scrub.sh | 2 + cmd/zpool/zpool_main.c | 376 +++++- cmd/ztest/ztest.c | 16 +- configure.ac | 1 - contrib/pyzfs/libzfs_core/_constants.py | 2 + include/libzfs.h | 7 +- include/sys/Makefile.am | 1 + include/sys/dsl_scan.h | 2 + include/sys/fs/zfs.h | 38 +- include/sys/spa.h | 10 +- include/sys/spa_impl.h | 1 + include/sys/vdev.h | 2 +- include/sys/vdev_impl.h | 17 +- include/sys/vdev_rebuild.h | 97 ++ include/sys/zio_priority.h | 1 + include/zfeature_common.h | 1 + lib/libzfs/libzfs_pool.c | 26 +- lib/libzfs/libzfs_status.c | 47 +- lib/libzfs/libzfs_util.c | 9 + lib/libzpool/Makefile.am | 1 + man/man5/zfs-module-parameters.5 | 36 + man/man5/zpool-features.5 | 29 + man/man8/zpool-attach.8 | 23 +- man/man8/zpool-replace.8 | 14 +- man/man8/zpool-status.8 | 4 +- module/Makefile.bsd | 1 + module/zcommon/zfeature_common.c | 5 + module/zfs/Makefile.in | 1 + module/zfs/dsl_scan.c | 42 +- module/zfs/spa.c | 109 +- module/zfs/spa_misc.c | 29 +- module/zfs/vdev.c | 238 +++- module/zfs/vdev_label.c | 17 + module/zfs/vdev_mirror.c | 5 +- module/zfs/vdev_queue.c | 18 +- module/zfs/vdev_rebuild.c | 1106 +++++++++++++++++ module/zfs/zfs_ioctl.c | 6 +- tests/runfiles/common.run | 13 +- tests/zfs-tests/include/libtest.shlib | 25 +- tests/zfs-tests/tests/functional/Makefile.am | 1 - .../cli_root/zpool_get/zpool_get.cfg | 1 + .../cli_root/zpool_wait/scan/Makefile.am | 1 + .../zpool_wait/scan/zpool_wait_rebuild.ksh | 64 + .../tests/functional/replacement/Makefile.am | 17 +- .../functional/replacement/attach_import.ksh | 67 + .../replacement/attach_multiple.ksh | 111 ++ .../functional/replacement/attach_rebuild.ksh | 173 +++ ...cement_002_pos.ksh => attach_resilver.ksh} | 16 +- .../{replacement_003_pos.ksh => detach.ksh} | 10 +- 
.../replacement/rebuild_disabled_feature.ksh | 78 ++ .../replacement/rebuild_multiple.ksh | 126 ++ .../functional/replacement/rebuild_raidz.ksh | 70 ++ .../functional/replacement/replace_import.ksh | 67 + .../replacement/replace_rebuild.ksh | 158 +++ ...ement_001_pos.ksh => replace_resilver.ksh} | 9 +- .../functional/replacement/replacement.cfg | 5 + .../resilver_restart_001.ksh | 39 +- .../resilver_restart_002.ksh | 24 +- .../functional/replacement/scrub_cancel.ksh | 112 ++ .../tests/functional/resilver/Makefile.am | 9 - .../tests/functional/resilver/cleanup.ksh | 31 - .../tests/functional/resilver/resilver.cfg | 32 - .../tests/functional/resilver/setup.ksh | 31 - 65 files changed, 3281 insertions(+), 362 deletions(-) create mode 100644 include/sys/vdev_rebuild.h create mode 100644 module/zfs/vdev_rebuild.c create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/attach_import.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh rename tests/zfs-tests/tests/functional/replacement/{replacement_002_pos.ksh => attach_resilver.ksh} (92%) rename tests/zfs-tests/tests/functional/replacement/{replacement_003_pos.ksh => detach.ksh} (94%) create mode 100755 tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/replace_import.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh rename tests/zfs-tests/tests/functional/replacement/{replacement_001_pos.ksh => replace_resilver.ksh} (95%) rename tests/zfs-tests/tests/functional/{resilver => replacement}/resilver_restart_001.ksh 
(88%) rename tests/zfs-tests/tests/functional/{resilver => replacement}/resilver_restart_002.ksh (80%) create mode 100755 tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh delete mode 100644 tests/zfs-tests/tests/functional/resilver/Makefile.am delete mode 100755 tests/zfs-tests/tests/functional/resilver/cleanup.ksh delete mode 100644 tests/zfs-tests/tests/functional/resilver/resilver.cfg delete mode 100755 tests/zfs-tests/tests/functional/resilver/setup.ksh diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 1094d25dd..8d0a3b420 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -437,7 +437,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) return; } - ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE); + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" : diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index f3dbb24b8..665fb216d 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -237,7 +237,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) dev_name, basename(spare_name)); if (zpool_vdev_attach(zhp, dev_name, spare_name, - replacement, B_TRUE) == 0) { + replacement, B_TRUE, B_FALSE) == 0) { free(dev_name); nvlist_free(replacement); return (B_TRUE); @@ -319,12 +319,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state); + /* * If this is a resource notifying us of device removal then simply * check for an available spare and continue unless the device is a * l2arc vdev, in which case we just offline it. 
*/ - if (strcmp(class, "resource.fs.zfs.removed") == 0) { + if (strcmp(class, "resource.fs.zfs.removed") == 0 || + (strcmp(class, "resource.fs.zfs.statechange") == 0 && + state == VDEV_STATE_REMOVED)) { char *devtype; char *devname; @@ -365,8 +369,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, * healthy ones so we need to confirm the actual state value. */ if (strcmp(class, "resource.fs.zfs.statechange") == 0 && - nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, - &state) == 0 && state == VDEV_STATE_HEALTHY) { + state == VDEV_STATE_HEALTHY) { zfs_vdev_repair(hdl, nvl); return; } diff --git a/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/cmd/zed/zed.d/resilver_finish-start-scrub.sh index 6f9c0b309..c7cfd1ddb 100755 --- a/cmd/zed/zed.d/resilver_finish-start-scrub.sh +++ b/cmd/zed/zed.d/resilver_finish-start-scrub.sh @@ -5,10 +5,12 @@ # Exit codes: # 1: Internal error # 2: Script wasn't enabled in zed.rc +# 3: Scrubs are automatically started for sequential resilvers [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . 
"${ZED_ZEDLET_DIR}/zed-functions.sh" [ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2 +[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3 [ -n "${ZEVENT_POOL}" ] || exit 1 [ -n "${ZEVENT_SUBCLASS}" ] || exit 1 zed_check_cmd "${ZPOOL}" || exit 1 diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index ee6c479eb..cdf5511fe 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -337,7 +337,7 @@ get_usage(zpool_help_t idx) return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: - return (gettext("\tattach [-fw] [-o property=value] " + return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); @@ -380,7 +380,7 @@ get_usage(zpool_help_t idx) case HELP_ONLINE: return (gettext("\tonline [-e] ...\n")); case HELP_REPLACE: - return (gettext("\treplace [-fw] [-o property=value] " + return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-npsw] ...\n")); @@ -2077,10 +2077,10 @@ health_str_to_color(const char *health) */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, - nvlist_t *nv, int depth, boolean_t isspare) + nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; - uint_t c, children; + uint_t c, i, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; @@ -2266,6 +2266,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } } + /* The top-level vdevs have the rebuild stats */ + if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && + children == 0) { + if (vs->vs_rebuild_processed != 0) { + (void) printf(gettext(" (resilvering)")); + } + } + if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); @@ -2295,11 +2303,17 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const 
char *name, if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; + /* Provide vdev_rebuild_stats to children if available */ + if (vrs == NULL) { + (void) nvlist_lookup_uint64_array(nv, + ZPOOL_CONFIG_REBUILD_STATS, + (uint64_t **)&vrs, &i); + } + vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); - print_status_config(zhp, cb, vname, child[c], depth + 2, - isspare); + isspare, vrs); free(vname); } } @@ -2468,7 +2482,7 @@ print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, - B_FALSE); + B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); @@ -2622,6 +2636,7 @@ show_import(nvlist_t *config) break; case ZPOOL_STATUS_RESILVERING: + case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); @@ -6118,6 +6133,7 @@ static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; + boolean_t rebuild = B_FALSE; boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; @@ -6128,7 +6144,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) int ret; /* check options */ - while ((c = getopt(argc, argv, "fo:w")) != -1) { + while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; @@ -6146,6 +6162,9 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; + case 's': + rebuild = B_TRUE; + break; case 'w': wait = B_TRUE; break; @@ -6230,7 +6249,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) return (1); } - ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing); + ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, + rebuild); if (ret == 0 && wait) ret = 
zpool_wait(zhp, @@ -6244,9 +6264,10 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) } /* - * zpool replace [-fw] [-o property=value] + * zpool replace [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. + * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for replacing to complete before returning * @@ -6260,9 +6281,10 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-fw] [-o property=value] + * zpool attach [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. + * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for resilvering to complete before returning * @@ -7131,20 +7153,41 @@ zpool_do_trim(int argc, char **argv) return (error); } +/* + * Converts a total number of seconds to a human readable string broken + * down in to days/hours/minutes/seconds. + */ +static void +secs_to_dhms(uint64_t total, char *buf) +{ + uint64_t days = total / 60 / 60 / 24; + uint64_t hours = (total / 60 / 60) % 24; + uint64_t mins = (total / 60) % 60; + uint64_t secs = (total % 60); + + if (days > 0) { + (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", + (u_longlong_t)days, (u_longlong_t)hours, + (u_longlong_t)mins, (u_longlong_t)secs); + } else { + (void) sprintf(buf, "%02llu:%02llu:%02llu", + (u_longlong_t)hours, (u_longlong_t)mins, + (u_longlong_t)secs); + } +} + /* * Print out detailed scrub status. 
*/ static void -print_scan_status(pool_scan_stat_t *ps) +print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; - uint64_t total_secs_left; - uint64_t elapsed, secs_left, mins_left, hours_left, days_left; uint64_t pass_scanned, scanned, pass_issued, issued, total; - uint64_t scan_rate, issue_rate; + uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; - char srate_buf[7], irate_buf[7]; + char srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); @@ -7168,26 +7211,18 @@ print_scan_status(pool_scan_stat_t *ps) /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { - total_secs_left = end - start; - days_left = total_secs_left / 60 / 60 / 24; - hours_left = (total_secs_left / 60 / 60) % 24; - mins_left = (total_secs_left / 60) % 60; - secs_left = (total_secs_left % 60); + secs_to_dhms(end - start, time_buf); if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub repaired %s " - "in %llu days %02llu:%02llu:%02llu " - "with %llu errors on %s"), processed_buf, - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left, - (u_longlong_t)ps->pss_errors, ctime(&end)); + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilvered %s " - "in %llu days %02llu:%02llu:%02llu " - "with %llu errors on %s"), processed_buf, - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left, - (u_longlong_t)ps->pss_errors, ctime(&end)); + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { @@ -7235,13 +7270,9 @@ print_scan_status(pool_scan_stat_t *ps) scan_rate = pass_scanned / elapsed; 
issue_rate = pass_issued / elapsed; - total_secs_left = (issue_rate != 0 && total >= issued) ? + uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? ((total - issued) / issue_rate) : UINT64_MAX; - - days_left = total_secs_left / 60 / 60 / 24; - hours_left = (total_secs_left / 60 / 60) % 24; - mins_left = (total_secs_left / 60) % 60; - secs_left = (total_secs_left % 60); + secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); @@ -7271,10 +7302,84 @@ print_scan_status(pool_scan_stat_t *ps) if (pause == 0) { if (total_secs_left != UINT64_MAX && issue_rate >= 10 * 1024 * 1024) { - (void) printf(gettext(", %llu days " - "%02llu:%02llu:%02llu to go\n"), - (u_longlong_t)days_left, (u_longlong_t)hours_left, - (u_longlong_t)mins_left, (u_longlong_t)secs_left); + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext(", no estimated " + "completion time\n")); + } + } else { + (void) printf(gettext("\n")); + } +} + +static void +print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) +{ + if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) + return; + + printf(" "); + printf_color(ANSI_BOLD, gettext("scan:")); + printf(" "); + + uint64_t bytes_scanned = vrs->vrs_bytes_scanned; + uint64_t bytes_issued = vrs->vrs_bytes_issued; + uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; + uint64_t bytes_est = vrs->vrs_bytes_est; + uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / + (vrs->vrs_pass_time_ms + 1)) * 1000; + uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / + (vrs->vrs_pass_time_ms + 1)) * 1000; + double scan_pct = MIN((double)bytes_scanned * 100 / + (bytes_est + 1), 100); + + /* Format all of the numbers we will be reporting */ + char bytes_scanned_buf[7], bytes_issued_buf[7]; + char bytes_rebuilt_buf[7], bytes_est_buf[7]; + char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; + zfs_nicebytes(bytes_scanned, 
bytes_scanned_buf, + sizeof (bytes_scanned_buf)); + zfs_nicebytes(bytes_issued, bytes_issued_buf, + sizeof (bytes_issued_buf)); + zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, + sizeof (bytes_rebuilt_buf)); + zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); + zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); + zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); + + time_t start = vrs->vrs_start_time; + time_t end = vrs->vrs_end_time; + + /* Rebuild is finished or canceled. */ + if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { + secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); + (void) printf(gettext("resilvered (%s) %s in %s " + "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, + time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { + (void) printf(gettext("resilver (%s) canceled on %s"), + vdev_name, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + (void) printf(gettext("resilver (%s) in progress since %s"), + vdev_name, ctime(&start)); + } + + assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); + + secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / + MAX(scan_rate, 1), time_buf); + + (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " + "%s total\n"), bytes_scanned_buf, scan_rate_buf, + bytes_issued_buf, issue_rate_buf, bytes_est_buf); + (void) printf(gettext("\t%s resilvered, %.2f%% done"), + bytes_rebuilt_buf, scan_pct); + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + if (scan_rate >= 10 * 1024 * 1024) { + (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); @@ -7285,9 +7390,38 @@ print_scan_status(pool_scan_stat_t *ps) } /* - * As we don't scrub checkpointed blocks, we want to warn the - * user that we skipped scanning some blocks if a checkpoint exists - * or existed at any time during the scan. 
+ * Print rebuild status for top-level vdevs. + */ +static void +print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + char *name = zpool_vdev_name(g_zfs, zhp, + child[c], VDEV_NAME_TYPE_ID); + print_rebuild_status_impl(vrs, name); + free(name); + } + } +} + +/* + * As we don't scrub checkpointed blocks, we want to warn the user that we + * skipped scanning some blocks if a checkpoint exists or existed at any + * time during the scan. If a sequential instead of healing reconstruction + * was performed then the blocks were reconstructed. However, their checksums + * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) @@ -7318,6 +7452,95 @@ print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) } } +/* + * Returns B_TRUE if there is an active rebuild in progress. Otherwise, + * B_FALSE is returned and 'rebuild_end_time' is set to the end time for + * the last completed (or cancelled) rebuild. 
+ */ +static boolean_t +check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) +{ + nvlist_t **child; + uint_t children; + boolean_t rebuilding = B_FALSE; + uint64_t end_time = 0; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + + if (vrs->vrs_end_time > end_time) + end_time = vrs->vrs_end_time; + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + rebuilding = B_TRUE; + end_time = 0; + break; + } + } + } + + if (rebuild_end_time != NULL) + *rebuild_end_time = end_time; + + return (rebuilding); +} + +/* + * Print the scan status. + */ +static void +print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + uint64_t rebuild_end_time = 0, resilver_end_time = 0; + boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; + boolean_t active_resilver = B_FALSE; + pool_checkpoint_stat_t *pcs = NULL; + pool_scan_stat_t *ps = NULL; + uint_t c; + + if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c) == 0) { + if (ps->pss_func == POOL_SCAN_RESILVER) { + resilver_end_time = ps->pss_end_time; + active_resilver = (ps->pss_state == DSS_SCANNING); + } + + have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); + have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); + } + + boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); + boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); + + /* Always print the scrub status when available. */ + if (have_scrub) + print_scan_scrub_resilver_status(ps); + + /* + * When there is an active resilver or rebuild print its status. + * Otherwise print the status of the last resilver or rebuild. 
+ */ + if (active_resilver || (!active_rebuild && have_resilver && + resilver_end_time && resilver_end_time > rebuild_end_time)) { + print_scan_scrub_resilver_status(ps); + } else if (active_rebuild || (!active_resilver && have_rebuild && + rebuild_end_time && rebuild_end_time > resilver_end_time)) { + print_rebuild_status(zhp, nvroot); + } + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + print_checkpoint_scan_warning(ps, pcs); +} + /* * Print out detailed removal status. */ @@ -7504,7 +7727,7 @@ print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); - print_status_config(zhp, cb, name, spares[i], 2, B_TRUE); + print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } @@ -7524,7 +7747,8 @@ print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); - print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE); + print_status_config(zhp, cb, name, l2cache[i], 2, + B_FALSE, NULL); free(name); } } @@ -7718,6 +7942,7 @@ status_callback(zpool_handle_t *zhp, void *data) break; case ZPOOL_STATUS_RESILVERING: + case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices is " "currently being resilvered. 
The pool will\n\tcontinue " @@ -7727,6 +7952,16 @@ status_callback(zpool_handle_t *zhp, void *data) "complete.\n")); break; + case ZPOOL_STATUS_REBUILD_SCRUB: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices have " + "been sequentially resilvered, scrubbing\n\tthe pool " + "is recommended.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " + "verify all data checksums.\n")); + break; + case ZPOOL_STATUS_CORRUPT_DATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " @@ -7951,18 +8186,16 @@ status_callback(zpool_handle_t *zhp, void *data) nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_checkpoint_stat_t *pcs = NULL; - pool_scan_stat_t *ps = NULL; pool_removal_stat_t *prs = NULL; + print_scan_status(zhp, nvroot); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); + print_removal_status(zhp, prs); + (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); - (void) nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); - print_scan_status(ps); - print_checkpoint_scan_warning(ps, pcs); - print_removal_status(zhp, prs); print_checkpoint_status(pcs); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, @@ -7987,7 +8220,7 @@ status_callback(zpool_handle_t *zhp, void *data) printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, - B_FALSE); + B_FALSE, NULL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); @@ -9543,6 +9776,36 @@ vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) return (bytes_remaining); } +/* Add up the total number of bytes left to rebuild across 
top-level vdevs */ +static uint64_t +vdev_activity_top_remaining(nvlist_t *nv) +{ + uint64_t bytes_remaining = 0; + nvlist_t **child; + uint_t children; + int error; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + error = nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); + if (error == 0) { + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + bytes_remaining += (vrs->vrs_bytes_est - + vrs->vrs_bytes_rebuilt); + } + } + } + + return (bytes_remaining); +} + /* Whether any vdevs are 'spare' or 'replacing' vdevs */ static boolean_t vdev_any_spare_replacing(nvlist_t *nv) @@ -9652,6 +9915,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) bytes_rem[ZPOOL_WAIT_SCRUB] = rem; else bytes_rem[ZPOOL_WAIT_RESILVER] = rem; + } else if (check_rebuilding(nvroot, NULL)) { + bytes_rem[ZPOOL_WAIT_RESILVER] = + vdev_activity_top_remaining(nvroot); } bytes_rem[ZPOOL_WAIT_INITIALIZE] = diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index ce748da18..ca38271cc 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -3507,7 +3507,16 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, NULL, 0, 0, 1); - error = spa_vdev_attach(spa, oldguid, root, replacing); + /* + * When supported select either a healing or sequential resilver. 
+ */ + boolean_t rebuilding = B_FALSE; + if (pvd->vdev_ops == &vdev_mirror_ops || + pvd->vdev_ops == &vdev_root_ops) { + rebuilding = !!ztest_random(2); + } + + error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); nvlist_free(root); @@ -3527,10 +3536,11 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) expected_error = error; if (error == ZFS_ERR_CHECKPOINT_EXISTS || - error == ZFS_ERR_DISCARDING_CHECKPOINT) + error == ZFS_ERR_DISCARDING_CHECKPOINT || + error == ZFS_ERR_RESILVER_IN_PROGRESS || + error == ZFS_ERR_REBUILD_IN_PROGRESS) expected_error = error; - /* XXX workaround 6690467 */ if (error != expected_error && expected_error != EBUSY) { fatal(0, "attach (%s %llu, %s %llu, %d) " "returned %d, expected %d", diff --git a/configure.ac b/configure.ac index e405ddb57..c7f813d19 100644 --- a/configure.ac +++ b/configure.ac @@ -368,7 +368,6 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/rename_dirs/Makefile tests/zfs-tests/tests/functional/replacement/Makefile tests/zfs-tests/tests/functional/reservation/Makefile - tests/zfs-tests/tests/functional/resilver/Makefile tests/zfs-tests/tests/functional/rootpool/Makefile tests/zfs-tests/tests/functional/rsend/Makefile tests/zfs-tests/tests/functional/scrub_mirror/Makefile diff --git a/contrib/pyzfs/libzfs_core/_constants.py b/contrib/pyzfs/libzfs_core/_constants.py index 5c285164b..50dca67f3 100644 --- a/contrib/pyzfs/libzfs_core/_constants.py +++ b/contrib/pyzfs/libzfs_core/_constants.py @@ -95,6 +95,8 @@ zfs_errno = enum_with_offset(1024, [ 'ZFS_ERR_EXPORT_IN_PROGRESS', 'ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR', 'ZFS_ERR_STREAM_TRUNCATED', + 'ZFS_ERR_RESILVER_IN_PROGRESS', + 'ZFS_ERR_REBUILD_IN_PROGRESS', ], {} ) diff --git a/include/libzfs.h b/include/libzfs.h index 64a0a2035..873e8f304 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -79,7 +79,7 @@ typedef enum zfs_error { EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid 
replicas */ - EZFS_RESILVERING, /* currently resilvering */ + EZFS_RESILVERING, /* resilvering (healing reconstruction) */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ @@ -148,6 +148,7 @@ typedef enum zfs_error { EZFS_TRIM_NOTSUP, /* device does not support trim */ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ + EZFS_REBUILDING, /* resilvering (sequential reconstruction) */ EZFS_UNKNOWN } zfs_error_t; @@ -297,7 +298,7 @@ extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); extern int zpool_vdev_attach(zpool_handle_t *, const char *, - const char *, nvlist_t *, int); + const char *, nvlist_t *, int, boolean_t); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); extern int zpool_vdev_remove_cancel(zpool_handle_t *); @@ -387,6 +388,8 @@ typedef enum { ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ + ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ + ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ /* * Finally, the following indicates a healthy pool. 
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index ce781aa4c..0659c6419 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -89,6 +89,7 @@ COMMON_H = \ vdev_initialize.h \ vdev_raidz.h \ vdev_raidz_impl.h \ + vdev_rebuild.h \ vdev_removal.h \ vdev_trim.h \ xvattr.h \ diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index bcb896da3..8f929207d 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -42,6 +42,8 @@ struct dsl_dataset; struct dsl_pool; struct dmu_tx; +extern int zfs_scan_suspend_progress; + /* * All members of this structure must be uint64_t, for byteswap * purposes. diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 575a4af51..1bfd7a485 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -704,6 +704,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" +#define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ @@ -730,6 +731,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ +#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" /* * The persistent vdev state is stored as separate values rather than a single @@ -778,6 +780,9 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ "com.delphix:ms_unflushed_phys_txgs" +#define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ + "org.openzfs:vdev_rebuild" + #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" @@ -991,6 +996,21 @@ typedef enum dsl_scan_state { DSS_NUM_STATES } dsl_scan_state_t; 
+typedef struct vdev_rebuild_stat { + uint64_t vrs_state; /* vdev_rebuild_state_t */ + uint64_t vrs_start_time; /* time_t */ + uint64_t vrs_end_time; /* time_t */ + uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ + uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ + uint64_t vrs_bytes_issued; /* read bytes issued */ + uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrs_bytes_est; /* total bytes to scan */ + uint64_t vrs_errors; /* scanning errors */ + uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ + uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ + uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ +} vdev_rebuild_stat_t; + /* * Errata described by https://zfsonlinux.org/msg/ZFS-8000-ER. The ordering * of this enum must be maintained to ensure the errata identifiers map to @@ -1047,6 +1067,7 @@ typedef struct vdev_stat { uint64_t vs_trim_bytes_est; /* total bytes to trim */ uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ + uint64_t vs_rebuild_processed; /* bytes rebuilt */ } vdev_stat_t; /* @@ -1178,6 +1199,13 @@ typedef enum { VDEV_TRIM_COMPLETE, } vdev_trim_state_t; +typedef enum { + VDEV_REBUILD_NONE, + VDEV_REBUILD_ACTIVE, + VDEV_REBUILD_CANCELED, + VDEV_REBUILD_COMPLETE, +} vdev_rebuild_state_t; + /* * nvlist name constants. 
Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl @@ -1337,6 +1365,8 @@ typedef enum { ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, + ZFS_ERR_RESILVER_IN_PROGRESS, + ZFS_ERR_REBUILD_IN_PROGRESS, } zfs_errno_t; /* @@ -1478,7 +1508,12 @@ typedef enum { * given payloads: * * ESC_ZFS_RESILVER_START - * ESC_ZFS_RESILVER_END + * ESC_ZFS_RESILVER_FINISH + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING + * * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * @@ -1532,6 +1567,7 @@ typedef enum { #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" +#define ZFS_EV_RESILVER_TYPE "resilver_type" #ifdef __cplusplus } diff --git a/include/sys/spa.h b/include/sys/spa.h index 5806dda41..9b96eb1f8 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -790,17 +790,12 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 0x1000 - -/* - * Controls the behavior of spa_vdev_remove(). 
- */ -#define SPA_REMOVE_UNSPARE 0x01 -#define SPA_REMOVE_DONE 0x02 +#define SPA_ASYNC_REBUILD_DONE 0x2000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, - int replacing); + int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); @@ -988,6 +983,7 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 6481d5397..2c52cb666 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/include/sys/vdev.h b/include/sys/vdev.h index d93ef78f1..a7e880636 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -73,7 +73,7 @@ extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, - int scrub_done); + boolean_t scrub_done, boolean_t rebuild_done); extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 56407a191..b9298c62d 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -295,13 +296,26 @@ struct vdev { uint64_t 
vdev_trim_secure; /* requested secure TRIM */ uint64_t vdev_trim_action_time; /* start and end time */ - /* for limiting outstanding I/Os (initialize and TRIM) */ + /* Rebuild related */ + boolean_t vdev_rebuilding; + boolean_t vdev_rebuild_exit_wanted; + boolean_t vdev_rebuild_cancel_wanted; + boolean_t vdev_rebuild_reset_wanted; + kmutex_t vdev_rebuild_lock; + kcondvar_t vdev_rebuild_cv; + kthread_t *vdev_rebuild_thread; + vdev_rebuild_t vdev_rebuild_config; + + /* For limiting outstanding I/Os (initialize, TRIM, rebuild) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; + kmutex_t vdev_rebuild_io_lock; + kcondvar_t vdev_rebuild_io_cv; + uint64_t vdev_rebuild_inflight; /* * Values stored in the config for an indirect or removing vdev. @@ -358,6 +372,7 @@ struct vdev { uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ + uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h new file mode 100644 index 000000000..3d4b8cc46 --- /dev/null +++ b/include/sys/vdev_rebuild.h @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_VDEV_REBUILD_H +#define _SYS_VDEV_REBUILD_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Number of entries in the physical vdev_rebuild_phys structure. This + * state is stored per top-level as VDEV_ZAP_TOP_VDEV_REBUILD_PHYS. + */ +#define REBUILD_PHYS_ENTRIES 12 + +/* + * On-disk rebuild configuration and state. When adding new fields they + * must be added to the end of the structure. + */ +typedef struct vdev_rebuild_phys { + uint64_t vrp_rebuild_state; /* vdev_rebuild_state_t */ + uint64_t vrp_last_offset; /* last rebuilt offset */ + uint64_t vrp_min_txg; /* minimum missing txg */ + uint64_t vrp_max_txg; /* maximum missing txg */ + uint64_t vrp_start_time; /* start time */ + uint64_t vrp_end_time; /* end time */ + uint64_t vrp_scan_time_ms; /* total run time in ms */ + uint64_t vrp_bytes_scanned; /* alloc bytes scanned */ + uint64_t vrp_bytes_issued; /* read bytes rebuilt */ + uint64_t vrp_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrp_bytes_est; /* total bytes to scan */ + uint64_t vrp_errors; /* errors during rebuild */ +} vdev_rebuild_phys_t; + +/* + * The vdev_rebuild_t describes the current state and how a top-level vdev + * should be rebuilt. The core elements are the top-vdev, the metaslab being + * rebuilt, range tree containing the allocated extents and the on-disk state. 
+ */ +typedef struct vdev_rebuild { + vdev_t *vr_top_vdev; /* top-level vdev to rebuild */ + metaslab_t *vr_scan_msp; /* scanning disabled metaslab */ + range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */ + + /* In-core state and progress */ + uint64_t vr_scan_offset[TXG_SIZE]; + uint64_t vr_prev_scan_time_ms; /* any previous scan time */ + + /* Per-rebuild pass statistics for calculating bandwidth */ + uint64_t vr_pass_start_time; + uint64_t vr_pass_bytes_scanned; + uint64_t vr_pass_bytes_issued; + + /* On-disk state updated by vdev_rebuild_zap_update_sync() */ + vdev_rebuild_phys_t vr_rebuild_phys; +} vdev_rebuild_t; + +boolean_t vdev_rebuild_active(vdev_t *); + +int vdev_rebuild_load(vdev_t *); +void vdev_rebuild(vdev_t *); +void vdev_rebuild_stop_wait(vdev_t *); +void vdev_rebuild_stop_all(spa_t *); +void vdev_rebuild_restart(spa_t *); +void vdev_rebuild_clear_sync(void *, dmu_tx_t *); +int vdev_rebuild_get_stats(vdev_t *, vdev_rebuild_stat_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_REBUILD_H */ diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index 0b422904e..2d8e7fc36 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -31,6 +31,7 @@ typedef enum zio_priority { ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_TRIM, /* trim I/O (discard) */ + ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. 
free) */ } zio_priority_t; diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 2d8767d5b..7e19a62e2 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -74,6 +74,7 @@ typedef enum spa_feature { SPA_FEATURE_BOOKMARK_WRITTEN, SPA_FEATURE_LOG_SPACEMAP, SPA_FEATURE_LIVELIST, + SPA_FEATURE_DEVICE_REBUILD, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 11b3d4cd9..f848cb3cf 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2446,7 +2446,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); - if (ps && ps->pss_func == POOL_SCAN_SCRUB) { + if (ps && ps->pss_func == POOL_SCAN_SCRUB && + ps->pss_state == DSS_SCANNING) { if (cmd == POOL_SCRUB_PAUSE) return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg)); else @@ -3128,8 +3129,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) * If 'replacing' is specified, the new disk will replace the old one. 
*/ int -zpool_vdev_attach(zpool_handle_t *zhp, - const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing) +zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, + const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild) { zfs_cmd_t zc = {"\0"}; char msg[1024]; @@ -3164,6 +3165,14 @@ zpool_vdev_attach(zpool_handle_t *zhp, verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); zc.zc_cookie = replacing; + zc.zc_simple = rebuild; + + if (rebuild && + zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "the loaded zfs module doesn't support device rebuilds")); + return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); + } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { @@ -3224,16 +3233,21 @@ zpool_vdev_attach(zpool_handle_t *zhp, uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); - if (islog) + if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); - else if (version >= SPA_VERSION_MULTI_REPLACE) + } else if (rebuild) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "only mirror vdevs support sequential " + "reconstruction")); + } else if (version >= SPA_VERSION_MULTI_REPLACE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " "for completion or use 'zpool detach'")); - else + } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); + } } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only attach to mirrors and top-level " diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index ebf497db6..67b8ea33e 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -84,6 +84,8 @@ static char *zfs_msgid_table[] = { * ZPOOL_STATUS_RESILVERING * ZPOOL_STATUS_OFFLINE_DEV * ZPOOL_STATUS_REMOVED_DEV + * ZPOOL_STATUS_REBUILDING + * ZPOOL_STATUS_REBUILD_SCRUB 
* ZPOOL_STATUS_OK */ }; @@ -195,7 +197,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) * - Check for any data errors * - Check for any faulted or missing devices in a replicated config * - Look for any devices showing errors - * - Check for any resilvering devices + * - Check for any resilvering or rebuilding devices * * There can obviously be multiple errors within a single pool, so this routine * only picks the most damaging of all the current errors to report. @@ -233,6 +235,49 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) ps->pss_state == DSS_SCANNING) return (ZPOOL_STATUS_RESILVERING); + /* + * Currently rebuilding a vdev, check top-level vdevs. + */ + vdev_rebuild_stat_t *vrs = NULL; + nvlist_t **child; + uint_t c, i, children; + uint64_t rebuild_end_time = 0; + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if ((nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, + (uint64_t **)&vrs, &i) == 0) && (vrs != NULL)) { + uint64_t state = vrs->vrs_state; + + if (state == VDEV_REBUILD_ACTIVE) { + return (ZPOOL_STATUS_REBUILDING); + } else if (state == VDEV_REBUILD_COMPLETE && + vrs->vrs_end_time > rebuild_end_time) { + rebuild_end_time = vrs->vrs_end_time; + } + } + } + + /* + * If we can determine when the last scrub was run, and it + * was before the last rebuild completed, then recommend + * that the pool be scrubbed to verify all checksums. When + * ps is NULL we can infer the pool has never been scrubbed. + */ + if (rebuild_end_time > 0) { + if (ps != NULL) { + if ((ps->pss_state == DSS_FINISHED && + ps->pss_func == POOL_SCAN_SCRUB && + rebuild_end_time > ps->pss_end_time) || + ps->pss_state == DSS_NONE) + return (ZPOOL_STATUS_REBUILD_SCRUB); + } else { + return (ZPOOL_STATUS_REBUILD_SCRUB); + } + } + } + /* * The multihost property is set and the pool may be active. 
*/ diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 21bd8289c..2f4aaed32 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -286,6 +286,9 @@ libzfs_error_description(libzfs_handle_t *hdl) "resilver_defer feature")); case EZFS_EXPORT_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "pool export in progress")); + case EZFS_REBUILDING: + return (dgettext(TEXT_DOMAIN, "currently sequentially " + "resilvering")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -693,6 +696,12 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_EXPORT_IN_PROGRESS: zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap); break; + case ZFS_ERR_RESILVER_IN_PROGRESS: + zfs_verror(hdl, EZFS_RESILVERING, fmt, ap); + break; + case ZFS_ERR_REBUILD_IN_PROGRESS: + zfs_verror(hdl, EZFS_REBUILDING, fmt, ap); + break; case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 46befa7d4..06b89fe0a 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -132,6 +132,7 @@ KERNEL_C = \ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ vdev_raidz_math_powerpc_altivec.c \ + vdev_rebuild.c \ vdev_removal.c \ vdev_root.c \ vdev_trim.c \ diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 687b85d0b..3fbd3c67f 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1862,6 +1862,30 @@ queue's min_active. See the section "ZFS I/O SCHEDULER". Default value: \fB1,000\fR. .RE +.sp +.ne 2 +.na +\fBzfs_vdev_rebuild_max_active\fR (int) +.ad +.RS 12n +Maximum sequential resilver I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB3\fR. 
+.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_rebuild_min_active\fR (int) +.ad +.RS 12n +Minimum sequential resilver I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + .sp .ne 2 .na @@ -2707,6 +2731,18 @@ Include cache hits in read history Use \fB1\fR for yes and \fB0\fR for no (default). .RE +.sp +.ne 2 +.na +\fBzfs_rebuild_max_segment\fR (ulong) +.ad +.RS 12n +Maximum read segment size to issue when sequentially resilvering a +top-level vdev. +.sp +Default value: \fB1,048,576\fR. +.RE + .sp .ne 2 .na diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index e7a61957f..3f690c334 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -255,6 +255,35 @@ This feature becomes \fBactive\fR when a bookmark is created and will be returned to the \fBenabled\fR state when all bookmarks with these fields are destroyed. .RE +.sp +.ne 2 +.na +\fBdevice_rebuild\fR +.ad +.RS 4n +.TS +l l . +GUID org.openzfs:device_rebuild +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +This feature enables the ability for the \fBzpool attach\fR and \fBzpool +replace\fR subcommands to perform sequential reconstruction (instead of +healing reconstruction) when resilvering. + +Sequential reconstruction resilvers a device in LBA order without immediately +verifying the checksums. Once complete a scrub is started which then verifies +the checksums. This approach allows full redundancy to be restored to the pool +in the minimum amount of time. This two phase approach will take longer than a +healing resilver when the time to verify the checksums is included. However, +unless there is additional pool damage no checksum errors should be reported +by the scrub. This feature is incompatible with raidz configurations. + +This feature becomes \fBactive\fR while a sequential resilver is in progress, +and returns to \fBenabled\fR when the resilver completes. 
+.RE + .sp .ne 2 .na diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8 index be0be4e07..585357b96 100644 --- a/man/man8/zpool-attach.8 +++ b/man/man8/zpool-attach.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd May 15, 2020 .Dt ZPOOL-ATTACH 8 .Os Linux .Sh NAME @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm .Cm attach -.Op Fl fw +.Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool device new_device .Sh DESCRIPTION @@ -44,7 +44,7 @@ .It Xo .Nm .Cm attach -.Op Fl fw +.Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool device new_device .Xc @@ -68,22 +68,29 @@ is part of a two-way mirror, attaching creates a three-way mirror, and so on. In either case, .Ar new_device -begins to resilver immediately. +begins to resilver immediately and any running scrub is cancelled. .Bl -tag -width Ds .It Fl f Forces use of .Ar new_device , even if it appears to be in use. Not all devices can be overridden in this manner. -.It Fl w -Waits until -.Ar new_device -has finished resilvering before returning. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the .Xr zpoolprops 8 manual page for a list of valid properties that can be set. The only property supported at the moment is ashift. +.It Fl s +The +.Ar new_device +is reconstructed sequentially to restore redundancy as quickly as possible. +Checksums are not verified during sequential reconstruction so a scrub is +started when the resilver completes. +Sequential reconstruction is not supported for raidz configurations. +.It Fl w +Waits until +.Ar new_device +has finished resilvering before returning. .El .El .Sh SEE ALSO diff --git a/man/man8/zpool-replace.8 b/man/man8/zpool-replace.8 index 933fb4ae9..5e639feaf 100644 --- a/man/man8/zpool-replace.8 +++ b/man/man8/zpool-replace.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
.\" -.Dd August 9, 2019 +.Dd May 15, 2020 .Dt ZPOOL-REPLACE 8 .Os Linux .Sh NAME @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm .Cm replace -.Op Fl fw +.Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool Ar device Op Ar new_device .Sh DESCRIPTION @@ -44,7 +44,7 @@ .It Xo .Nm .Cm replace -.Op Fl fw +.Op Fl fsw .Op Fl o Ar property Ns = Ns Ar value .Ar pool Ar device Op Ar new_device .Xc @@ -56,6 +56,7 @@ This is equivalent to attaching .Ar new_device , waiting for it to resilver, and then detaching .Ar old_device . +Any in progress scrub will be cancelled. .Pp The size of .Ar new_device @@ -86,6 +87,13 @@ Sets the given pool properties. See the manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . +.It Fl s +The +.Ar new_device +is reconstructed sequentially to restore redundancy as quickly as possible. +Checksums are not verified during sequential reconstruction so a scrub is +started when the resilver completes. +Sequential reconstruction is not supported for raidz configurations. .It Fl w Waits until the replacement has completed before returning. .El diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 7364bf635..66e335995 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd May 15, 2020 .Dt ZPOOL-STATUS 8 .Os Linux .Sh NAME @@ -59,7 +59,7 @@ is specified, then the status of each pool in the system is displayed. For more information on pool and device health, see the .Em Device Failure and Recovery section of -.Xr zpoolconcepts 8. +.Xr zpoolconcepts 8 . .Pp If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. 
diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 7c83113ac..1ac9d00e7 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -251,6 +251,7 @@ SRCS+= abd.c \ vdev_raidz.c \ vdev_raidz_math.c \ vdev_raidz_math_scalar.c \ + vdev_rebuild.c \ vdev_raidz_math_avx2.c \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_avx512f.c \ diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index cf3006721..302d48570 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -570,6 +570,11 @@ zpool_feature_init(void) "com.datto:resilver_defer", "resilver_defer", "Support for deferring new resilvers when one is already running.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_DEVICE_REBUILD, + "org.openzfs:device_rebuild", "device_rebuild", + "Support for sequential device rebuilds", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 7ea976d12..9ddcd6c33 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -94,6 +94,7 @@ $(MODULE)-objs += vdev_queue.o $(MODULE)-objs += vdev_raidz.o $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o +$(MODULE)-objs += vdev_rebuild.o $(MODULE)-objs += vdev_removal.o $(MODULE)-objs += vdev_root.o $(MODULE)-objs += vdev_trim.o diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 895ffbf0a..712af664e 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -704,8 +704,9 @@ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - if (dsl_scan_is_running(scn)) + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) return (SET_ERROR(EBUSY)); return (0); @@ -746,8 +747,12 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) if (vdev_resilver_needed(spa->spa_root_vdev, 
&scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { - spa_event_notify(spa, NULL, NULL, + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_START); + nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); } @@ -761,6 +766,21 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + /* + * When starting a resilver clear any existing rebuild state. + * This is required to prevent stale rebuild status from + * being reported when a rebuild is run, then a resilver and + * finally a scrub. In which case only the scrub status + * should be reported by 'zpool status'. + */ + if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + vdev_rebuild_clear_sync( + (void *)(uintptr_t)vd->vdev_id, tx); + } + } } /* back to the generic stuff */ @@ -918,14 +938,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) if (complete && !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - scn->scn_phys.scn_max_txg, B_TRUE); + scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); - spa_event_notify(spa, NULL, NULL, - scn->scn_phys.scn_min_txg ? 
- ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + if (scn->scn_phys.scn_min_txg) { + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, + ESC_ZFS_RESILVER_FINISH); + nvlist_free(aux); + } else { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_SCRUB_FINISH); + } } else { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - 0, B_TRUE); + 0, B_TRUE, B_FALSE); } spa_errlog_rotate(spa); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 943330886..6b60227d2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -1562,6 +1563,7 @@ spa_unload(spa_t *spa) vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); } /* @@ -4240,7 +4242,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_dtl_reassess(rvd, 0, 0, B_FALSE); + vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); @@ -4829,11 +4831,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) update_config_cache); /* - * Check all DTLs to see if anything needs resilvering. + * Check if a rebuild was in progress and if so resume it. + * Then check all DTLs to see if anything needs resilvering. + * The resilver will be deferred if a rebuild was started. 
*/ - if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + if (vdev_rebuild_active(spa->spa_root_vdev)) { + vdev_rebuild_restart(spa); + } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); + } /* * Log the fact that we booted up (so that we can detect if @@ -6313,6 +6320,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); } /* @@ -6536,12 +6544,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. + * + * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild) + * should be performed instead of traditional healing reconstruction. From + * an administrator's perspective these are both resilver operations. 
*/ int -spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, + int rebuild) { uint64_t txg, dtl_max_txg; - vdev_t *rvd __maybe_unused = spa->spa_root_vdev; + vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; @@ -6561,6 +6574,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) return (spa_vdev_exit(spa, NULL, txg, error)); } + if (rebuild) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + if (dsl_scan_resilvering(spa_get_dsl(spa))) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RESILVER_IN_PROGRESS)); + } else { + if (vdev_rebuild_active(rvd)) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_REBUILD_IN_PROGRESS)); + } + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); @@ -6593,6 +6619,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + if (rebuild) { + /* + * For rebuilds, the parent vdev must support reconstruction + * using only space maps. This means the only allowable + * parents are the root vdev or a mirror vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + } + if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root @@ -6646,7 +6684,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * than the top-level vdev. 
*/ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) - return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If this is an in-place replacement, update oldvd's path and devid @@ -6664,9 +6702,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) } } - /* mark the device being resilvered */ - newvd->vdev_resilver_txg = txg; - /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. @@ -6704,8 +6739,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -6722,16 +6757,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_dirty(tvd, VDD_DTL, newvd, txg); /* - * Schedule the resilver to restart in the future. We do this to - * ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. We do not do this if resilvers have been - * deferred. + * Schedule the resilver or rebuild to restart in the future. We do + * this to ensure that dmu_sync-ed blocks have been stitched into the + * respective datasets. 
*/ - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) - vdev_defer_resilver(newvd); - else - dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); + } else { + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } + } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -6774,7 +6818,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) ASSERT(spa_writeable(spa)); - txg = spa_vdev_enter(spa); + txg = spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -7728,6 +7772,12 @@ spa_vdev_resilver_done(spa_t *spa) } spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * If a detach was not performed above replace waiters will not have + * been notified. In which case we must do so now. + */ + spa_notify_waiters(spa); } /* @@ -7970,10 +8020,22 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_RESILVER_DONE) spa_vdev_resilver_done(spa); + /* + * If any devices are done replacing, detach them. Then if no + * top-level vdevs are rebuilding attempt to kick off a scrub. + */ + if (tasks & SPA_ASYNC_REBUILD_DONE) { + spa_vdev_resilver_done(spa); + + if (!vdev_rebuild_active(spa->spa_root_vdev)) + (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); + } + /* * Kick off a resilver. 
*/ if (tasks & SPA_ASYNC_RESILVER && + !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); @@ -9470,6 +9532,9 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: + if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + break; + /* fall through */ case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 61cefa3dd..4c884409a 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1165,6 +1165,30 @@ spa_vdev_enter(spa_t *spa) return (spa_vdev_config_enter(spa)); } +/* + * The same as spa_vdev_enter() above but additionally takes the guid of + * the vdev being detached. When there is a rebuild in process it will be + * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). + * The rebuild is canceled if only a single child remains after the detach. + */ +uint64_t +spa_vdev_detach_enter(spa_t *spa, uint64_t guid) +{ + mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); + + vdev_autotrim_stop_all(spa); + + if (guid != 0) { + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd) { + vdev_rebuild_stop_wait(vd->vdev_top); + } + } + + return (spa_vdev_config_enter(spa)); +} + /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while @@ -1198,7 +1222,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) /* * Reassess the DTLs. 
*/ - vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; @@ -1271,6 +1295,7 @@ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); + vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); @@ -1322,7 +1347,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) } if (vd != NULL || error == 0) - vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE); + vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 03360120a..27ac17fea 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -551,10 +552,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); @@ -562,10 +565,16 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL); + 
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } + txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, @@ -835,6 +844,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + &vd->vdev_rebuild_txg); + if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); @@ -890,6 +902,7 @@ vdev_free(vdev_t *vd) ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the @@ -998,10 +1011,12 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); + mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); + mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); @@ -1009,6 +1024,11 @@ vdev_free(vdev_t *vd) cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_trim_io_cv); + mutex_destroy(&vd->vdev_rebuild_lock); + mutex_destroy(&vd->vdev_rebuild_io_lock); + cv_destroy(&vd->vdev_rebuild_cv); + cv_destroy(&vd->vdev_rebuild_io_cv); + zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -1078,7 +1098,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); + 
ASSERT0(tvd->vdev_rebuilding); tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_rebuilding = svd->vdev_rebuilding; + tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; @@ -1092,6 +1115,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; + svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) @@ -2576,11 +2600,8 @@ vdev_dtl_max(vdev_t *vd) * excise the DTLs. */ static boolean_t -vdev_dtl_should_excise(vdev_t *vd) +vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { - spa_t *spa = vd->vdev_spa; - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) @@ -2589,23 +2610,52 @@ vdev_dtl_should_excise(vdev_t *vd) if (vd->vdev_resilver_deferred) return (B_FALSE); - if (vd->vdev_resilver_txg == 0 || - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) + if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); - /* - * When a resilver is initiated the scan will assign the scn_max_txg - * value to the highest txg value that exists in all DTLs. If this - * device's max DTL is not part of this scan (i.e. it is not in - * the range (scn_min_txg, scn_max_txg] then it is not eligible - * for excision. 
- */ - if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { - ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); - ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); - ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); - return (B_TRUE); + if (rebuild_done) { + vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + /* Rebuild not initiated by attach */ + if (vd->vdev_rebuild_txg == 0) + return (B_TRUE); + + /* + * When a rebuild completes without error then all missing data + * up to the rebuild max txg has been reconstructed and the DTL + * is eligible for excision. + */ + if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && + vdev_dtl_max(vd) <= vrp->vrp_max_txg) { + ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); + ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); + ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); + return (B_TRUE); + } + } else { + dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; + dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; + + /* Resilver not initiated by attach */ + if (vd->vdev_resilver_txg == 0) + return (B_TRUE); + + /* + * When a resilver is initiated the scan will assign the + * scn_max_txg value to the highest txg value that exists + * in all DTLs. If this device's max DTL is not part of this + * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] + * then it is not eligible for excision. + */ + if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { + ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); + ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); + ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); + return (B_TRUE); + } } + return (B_FALSE); } @@ -2614,7 +2664,8 @@ vdev_dtl_should_excise(vdev_t *vd) * write operations will be issued to the pool. 
*/ void -vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) +vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; @@ -2624,22 +2675,28 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, - scrub_txg, scrub_done); + scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; + boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* - * If requested, pretend the scan completed cleanly. + * If requested, pretend the scan or rebuild completed cleanly. */ - if (zfs_scan_ignore_errors && scn) - scn->scn_phys.scn_errors = 0; + if (zfs_scan_ignore_errors) { + if (scn != NULL) + scn->scn_phys.scn_errors = 0; + if (vr != NULL) + vr->vr_rebuild_phys.vrp_errors = 0; + } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { @@ -2654,21 +2711,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) } /* - * If we've completed a scan cleanly then determine - * if this vdev should remove any DTLs. We only want to - * excise regions on vdevs that were available during - * the entire duration of this scan. + * If we've completed a scrub/resilver or a rebuild cleanly + * then determine if this vdev should remove any DTLs. We + * only want to excise regions on vdevs that were available + * during the entire duration of this scan. 
*/ - if (scrub_txg != 0 && - (spa->spa_scrub_started || - (scn != NULL && scn->scn_phys.scn_errors == 0)) && - vdev_dtl_should_excise(vd)) { + if (rebuild_done && + vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { + check_excise = B_TRUE; + } else { + if (spa->spa_scrub_started || + (scn != NULL && scn->scn_phys.scn_errors == 0)) { + check_excise = B_TRUE; + } + } + + if (scrub_txg && check_excise && + vdev_dtl_should_excise(vd, rebuild_done)) { /* - * We completed a scrub up to scrub_txg. If we - * did it without rebooting, then the scrub dtl - * will be valid, so excise the old region and - * fold in the scrub dtl. Otherwise, leave the - * dtl as-is if there was an error. + * We completed a scrub, resilver or rebuild up to + * scrub_txg. If we did it without rebooting, then + * the scrub dtl will be valid, so excise the old + * region and fold in the scrub dtl. Otherwise, + * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference @@ -2711,15 +2776,20 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* - * If the vdev was resilvering and no longer has any - * DTLs then reset its resilvering flag and dirty + * If the vdev was resilvering or rebuilding and no longer + * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. 
*/ - if (txg != 0 && vd->vdev_resilver_txg != 0 && + if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { - vd->vdev_resilver_txg = 0; - vdev_config_dirty(vd->vdev_top); + if (vd->vdev_rebuild_txg != 0) { + vd->vdev_rebuild_txg = 0; + vdev_config_dirty(vd->vdev_top); + } else if (vd->vdev_resilver_txg != 0) { + vd->vdev_resilver_txg = 0; + vdev_config_dirty(vd->vdev_top); + } } mutex_exit(&vd->vdev_dtl_lock); @@ -2955,10 +3025,10 @@ vdev_dtl_required(vdev_t *vd) * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, @@ -3065,6 +3135,20 @@ vdev_load(vdev_t *vd) } } + /* + * Load any rebuild state from the top-level vdev zap. + */ + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + error = vdev_rebuild_load(vd); + if (error && error != ENOTSUP) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " + "failed [error=%d]", error); + return (error); + } + } + /* * If this is a top-level vdev, initialize its metaslabs. 
*/ @@ -3947,6 +4031,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; @@ -3973,7 +4058,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; + + /* Set when there is a deferred resilver. */ + vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } + /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab @@ -3985,13 +4074,16 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } + + /* + * Report fragmentation and rebuild progress for top-level, + * non-auxiliary, concrete devices. + */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } - if (vd->vdev_ops->vdev_op_leaf) - vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } vdev_get_stats_ex_impl(vd, vs, vsx); @@ -4072,17 +4164,35 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { + /* + * Repair is the result of a resilver issued by the + * scan thread (spa_sync). + */ if (flags & ZIO_FLAG_SCAN_THREAD) { - dsl_scan_phys_t *scn_phys = - &spa->spa_dsl_pool->dp_scan->scn_phys; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; - /* XXX cleanup? 
*/ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } + /* + * Repair is the result of a rebuild issued by the + * rebuild thread (vdev_rebuild_thread). + */ + if (zio->io_priority == ZIO_PRIORITY_REBUILD) { + vdev_t *tvd = vd->vdev_top; + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; + + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(rebuilt, psize); + vs->vs_rebuild_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -4094,6 +4204,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; + zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as @@ -4103,19 +4214,44 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; + /* + * Solely for the purposes of 'zpool iostat -lqrw' + * reporting use the priority to catagorize the IO. + * Only the following are reported to user space: + * + * ZIO_PRIORITY_SYNC_READ, + * ZIO_PRIORITY_SYNC_WRITE, + * ZIO_PRIORITY_ASYNC_READ, + * ZIO_PRIORITY_ASYNC_WRITE, + * ZIO_PRIORITY_SCRUB, + * ZIO_PRIORITY_TRIM. + */ + if (priority == ZIO_PRIORITY_REBUILD) { + priority = ((type == ZIO_TYPE_WRITE) ? + ZIO_PRIORITY_ASYNC_WRITE : + ZIO_PRIORITY_SCRUB); + } else if (priority == ZIO_PRIORITY_INITIALIZING) { + ASSERT3U(type, ==, ZIO_TYPE_WRITE); + priority = ZIO_PRIORITY_ASYNC_WRITE; + } else if (priority == ZIO_PRIORITY_REMOVAL) { + priority = ((type == ZIO_TYPE_WRITE) ? 
+ ZIO_PRIORITY_ASYNC_WRITE : + ZIO_PRIORITY_ASYNC_READ); + } + vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { - vsx->vsx_agg_histo[zio->io_priority] + vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { - vsx->vsx_ind_histo[zio->io_priority] + vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { - vsx->vsx_queue_histo[zio->io_priority] + vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 81cfd5cce..8c7468255 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -404,6 +404,19 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) } } +static void +top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + if (vd == vd->vdev_top) { + vdev_rebuild_stat_t vrs; + if (vdev_rebuild_get_stats(vd, &vrs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs, + sizeof (vrs) / sizeof (uint64_t)); + } + } +} + /* * Generate the nvlist representing this vdev's config. 
*/ @@ -559,6 +572,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_generate_stats(vd, nv); root_vdev_actions_getprogress(vd, nv); + top_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context @@ -663,6 +677,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_resilver_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, vd->vdev_resilver_txg); + if (vd->vdev_rebuild_txg != 0) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + vd->vdev_rebuild_txg); if (vd->vdev_faulted) fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 3edd65c01..094530e9b 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -767,8 +767,9 @@ vdev_mirror_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio->io_abd, zio->io_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e31271dcb..a8ef3d747 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -158,6 +158,8 @@ uint32_t zfs_vdev_initializing_min_active = 1; uint32_t zfs_vdev_initializing_max_active = 1; uint32_t zfs_vdev_trim_min_active = 1; uint32_t zfs_vdev_trim_max_active = 2; +uint32_t zfs_vdev_rebuild_min_active = 1; +uint32_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -278,6 +280,8 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_initializing_min_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); + case ZIO_PRIORITY_REBUILD: + return (zfs_vdev_rebuild_min_active); default: panic("invalid priority %u", p); return (0); @@ -352,6 +356,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (zfs_vdev_initializing_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); + case ZIO_PRIORITY_REBUILD: + return (zfs_vdev_rebuild_max_active); default: panic("invalid priority %u", p); return (0); @@ -845,7 +851,8 @@ vdev_queue_io(zio_t *zio) zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) { + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } } else if (zio->io_type == ZIO_TYPE_WRITE) { @@ -854,7 +861,8 @@ vdev_queue_io(zio_t *zio) if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) { + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } } else { @@ -1051,6 +1059,12 @@ 
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, + "Max active rebuild I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, + "Min active rebuild I/Os per vdev"); + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); /* END CSTYLED */ diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c new file mode 100644 index 000000000..bf1079fd7 --- /dev/null +++ b/module/zfs/vdev_rebuild.c @@ -0,0 +1,1106 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * + * Copyright (c) 2018, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file contains the sequential reconstruction implementation for + * resilvering. 
This form of resilvering is internally referred to as device
+ * rebuild to avoid conflating it with the traditional healing reconstruction
+ * performed by the dsl scan code.
+ *
+ * When replacing a device, or scrubbing the pool, ZFS has historically used
+ * a process called resilvering which is a form of healing reconstruction.
+ * This approach has the advantage that as blocks are read from disk their
+ * checksums can be immediately verified and the data repaired. Unfortunately,
+ * it also results in a random IO pattern to the disk even when extra care
+ * is taken to sequentialize the IO as much as possible. This substantially
+ * increases the time required to resilver the pool and restore redundancy.
+ *
+ * For mirrored devices it's possible to implement an alternate sequential
+ * reconstruction strategy when resilvering. Sequential reconstruction
+ * behaves like a traditional RAID rebuild and reconstructs a device in LBA
+ * order without verifying the checksum. After this phase completes a second
+ * scrub phase is started to verify all of the checksums. This two phase
+ * process will take longer than the healing reconstruction described above.
+ * However, it has the advantage that after the reconstruction first phase
+ * completes redundancy has been restored. At this point the pool can incur
+ * another device failure without risking data loss.
+ *
+ * There are a few noteworthy limitations and other advantages of resilvering
+ * using sequential reconstruction vs healing reconstruction.
+ *
+ * Limitations:
+ *
+ * - Only supported for mirror vdev types. Due to the variable stripe
+ * width used by raidz sequential reconstruction is not possible.
+ *
+ * - Block checksums are not verified during sequential reconstruction.
+ * Similar to traditional RAID the parity/mirror data is reconstructed
+ * but cannot be immediately double checked. For this reason when the
+ * last active resilver completes the pool is automatically scrubbed.
+ *
+ * - Deferred resilvers using sequential reconstruction are not currently
+ * supported. When adding another vdev to an active top-level resilver
+ * it must be restarted.
+ *
+ * Advantages:
+ *
+ * - Sequential reconstruction is performed in LBA order which may be faster
+ * than healing reconstruction particularly when using HDDs (or
+ * especially with SMR devices). Only allocated capacity is resilvered.
+ *
+ * - Sequential reconstruction is not constrained by ZFS block boundaries.
+ * This allows it to issue larger IOs to disk which span multiple blocks
+ * allowing all of these logical blocks to be repaired with a single IO.
+ *
+ * - Unlike a healing resilver or scrub which are pool wide operations,
+ * sequential reconstruction is handled by the top-level mirror vdevs.
+ * This allows for it to be started or canceled on a top-level vdev
+ * without impacting any other top-level vdevs in the pool.
+ *
+ * - Data only referenced by a pool checkpoint will be repaired because
+ * that space is reflected in the space maps. This differs for a
+ * healing resilver or scrub which will not repair that data.
+ */
+
+
+/*
+ * Maximum number of queued rebuild I/Os per top-level vdev. The number of
+ * concurrent rebuild I/Os issued to the device is controlled by the
+ * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module
+ * options.
+ */
+unsigned int zfs_rebuild_queue_limit = 20;
+
+/*
+ * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE.
+ */
+unsigned long zfs_rebuild_max_segment = 1024 * 1024;
+
+/*
+ * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
+ */
+static void vdev_rebuild_thread(void *arg);
+
+/*
+ * Clear the per-vdev rebuild bytes value for a vdev tree.
+ */ +static void +clear_rebuild_bytes(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (uint64_t i = 0; i < vd->vdev_children; i++) + clear_rebuild_bytes(vd->vdev_child[i]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_rebuild_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +/* + * Determines whether a vdev_rebuild_thread() should be stopped. + */ +static boolean_t +vdev_rebuild_should_stop(vdev_t *vd) +{ + return (!vdev_writeable(vd) || vd->vdev_removing || + vd->vdev_rebuild_exit_wanted || + vd->vdev_rebuild_cancel_wanted || + vd->vdev_rebuild_reset_wanted); +} + +/* + * Determine if the rebuild should be canceled. This may happen when all + * vdevs with MISSING DTLs are detached. + */ +static boolean_t +vdev_rebuild_should_cancel(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * The sync task for updating the on-disk state of a rebuild. This is + * scheduled by vdev_rebuild_range(). 
+ */ +static void +vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t txg = dmu_tx_get_txg(tx); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { + vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; + vr->vr_scan_offset[txg & TXG_MASK] = 0; + } + + vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + + NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Initialize the on-disk state for a new rebuild, start the rebuild thread. + */ +static void +vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_rebuilding); + + spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + mutex_enter(&vd->vdev_rebuild_lock); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_start_time = gethrestime_sec(); + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* + * Rebuilds are currently only used when replacing a device, in which + * case there must be DTL_MISSING entries. In the future, we could + * allow rebuilds to be used in a way similar to a scrub. This would + * be useful because it would allow us to rebuild the space used by + * pool checkpoints. 
+ */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu started", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) +{ + nvlist_t *aux = fnvlist_alloc(); + + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); + spa_event_notify(spa, vd, aux, name); + nvlist_free(aux); +} + +/* + * Called to request that a new rebuild be started. The feature will remain + * active for the duration of the rebuild, then revert to the enabled state. + */ +static void +vdev_rebuild_initiate(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vd->vdev_top == vd); + ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); + ASSERT(!vd->vdev_rebuilding); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + vd->vdev_rebuilding = B_TRUE; + + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); +} + +/* + * Update the on-disk state to completed when a rebuild finishes. 
+ */ +static void +vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu complete", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + /* Handles detaching of spares */ + spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + spa_notify_waiters(spa); + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Update the on-disk state to canceled when a rebuild finishes. 
+ */ +static void +vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu canceled", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + vd->vdev_rebuild_cancel_wanted = B_FALSE; + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + spa_notify_waiters(spa); + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Resets the progress of a running rebuild. This will occur when a new + * vdev is added to rebuild. 
+ */ +static void +vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + + vrp->vrp_last_offset = 0; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_bytes_scanned = 0; + vrp->vrp_bytes_issued = 0; + vrp->vrp_bytes_rebuilt = 0; + vrp->vrp_bytes_est = 0; + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* See vdev_rebuild_initiate_sync comment */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu reset", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + vd->vdev_rebuild_reset_wanted = B_FALSE; + ASSERT(vd->vdev_rebuilding); + + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Clear the last rebuild status. 
+ */ +void +vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + objset_t *mos = spa_meta_objset(spa); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || + vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { + mutex_exit(&vd->vdev_rebuild_lock); + return; + } + + clear_rebuild_bytes(vd); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + + if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { + VERIFY0(zap_update(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + } + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * The zio_done_func_t callback for each rebuild I/O issued. It's responsible + * for updating the rebuild stats and limiting the number of in flight I/Os. + */ +static void +vdev_rebuild_cb(zio_t *zio) +{ + vdev_rebuild_t *vr = zio->io_private; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vdev_t *vd = vr->vr_top_vdev; + + mutex_enter(&vd->vdev_rebuild_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the top-level vdev was unavailable. + * Attempt to roll back to the last completed offset, in order + * resume from the correct location if the pool is resumed. + * (This works because spa_sync waits on spa_txg_zio before + * it runs sync tasks.) 
+ */ + uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else if (zio->io_error) { + vrp->vrp_errors++; + } + + abd_free(zio->io_abd); + + ASSERT3U(vd->vdev_rebuild_inflight, >, 0); + vd->vdev_rebuild_inflight--; + cv_broadcast(&vd->vdev_rebuild_io_cv); + mutex_exit(&vd->vdev_rebuild_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* + * Rebuild the data in this range by constructing a special dummy block + * pointer for the given range. It has no relation to any existing blocks + * in the pool. But by disabling checksum verification and issuing a scrub + * I/O mirrored vdevs will replicate the block using any available mirror + * leaf vdevs. + */ +static void +vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, + uint64_t txg) +{ + vdev_t *vd = vr->vr_top_vdev; + spa_t *spa = vd->vdev_spa; + uint64_t psize = asize; + + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + + blkptr_t blk, *bp = &blk; + BP_ZERO(bp); + + DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], start); + DVA_SET_GANG(&bp->blk_dva[0], 0); + DVA_SET_ASIZE(&bp->blk_dva[0], asize); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(bp, psize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + /* + * We increment the issued bytes by the asize rather than the psize + * so the scanned and issued bytes may be directly compared. This + * is consistent with the scrub/resilver issued reporting. 
+ */ + vr->vr_pass_bytes_issued += asize; + vr->vr_rebuild_phys.vrp_bytes_issued += asize; + + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp, + abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, + ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | + ZIO_FLAG_RESILVER, NULL)); +} + +/* + * Issues a rebuild I/O and takes care of rate limiting the number of queued + * rebuild I/Os. The provided start and size must be properly aligned for the + * top-level vdev type being rebuilt. + */ +static int +vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) +{ + uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; + vdev_t *vd = vr->vr_top_vdev; + spa_t *spa = vd->vdev_spa; + + ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); + ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); + + vr->vr_pass_bytes_scanned += size; + vr->vr_rebuild_phys.vrp_bytes_scanned += size; + + mutex_enter(&vd->vdev_rebuild_io_lock); + + /* Limit in flight rebuild I/Os */ + while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit) + cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + + vd->vdev_rebuild_inflight++; + mutex_exit(&vd->vdev_rebuild_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + /* This is the first I/O for this txg. */ + if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { + vr->vr_scan_offset[txg & TXG_MASK] = start; + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_rebuild_update_sync, + (void *)(uintptr_t)vd->vdev_id, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* When exiting write out our progress. 
*/ + if (vdev_rebuild_should_stop(vd)) { + mutex_enter(&vd->vdev_rebuild_io_lock); + vd->vdev_rebuild_inflight--; + mutex_exit(&vd->vdev_rebuild_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_rebuild_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_rebuild_lock); + + vr->vr_scan_offset[txg & TXG_MASK] = start + size; + vdev_rebuild_rebuild_block(vr, start, size, txg); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Split range into legally-sized logical chunks given the constraints of the + * top-level mirror vdev type. + */ +static uint64_t +vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size) +{ + uint64_t chunk_size, max_asize, max_segment; + + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + + max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment, + 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); + max_asize = vdev_psize_to_asize(vd, max_segment); + chunk_size = MIN(size, max_asize); + + return (chunk_size); +} + +/* + * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. + */ +static int +vdev_rebuild_ranges(vdev_rebuild_t *vr) +{ + vdev_t *vd = vr->vr_top_vdev; + zfs_btree_t *t = &vr->vr_scan_tree->rt_root; + zfs_btree_index_t idx; + int error; + + for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + rs = zfs_btree_next(t, &idx, &idx)) { + uint64_t start = rs_get_start(rs, vr->vr_scan_tree); + uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; + + /* + * zfs_scan_suspend_progress can be set to disable rebuild + * progress for testing. See comment in dsl_scan_sync(). 
+ */ + while (zfs_scan_suspend_progress && + !vdev_rebuild_should_stop(vd)) { + delay(hz); + } + + while (size > 0) { + uint64_t chunk_size; + + chunk_size = vdev_rebuild_chunk_size(vd, start, size); + + error = vdev_rebuild_range(vr, start, chunk_size); + if (error != 0) + return (error); + + size -= chunk_size; + start += chunk_size; + } + } + + return (0); +} + +/* + * Calculates the estimated capacity which remains to be scanned. Since + * we traverse the pool in metaslab order only allocated capacity beyond + * the vrp_last_offset need be considered. All lower offsets must have + * already been rebuilt and are thus already included in vrp_bytes_scanned. + */ +static void +vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t bytes_est = vrp->vrp_bytes_scanned; + + if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) + return; + + for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + + mutex_enter(&msp->ms_lock); + bytes_est += metaslab_allocated_space(msp); + mutex_exit(&msp->ms_lock); + } + + vrp->vrp_bytes_est = bytes_est; +} + +/* + * Load from disk the top-level vdev's rebuild information. 
+ */ +int +vdev_rebuild_load(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + spa_t *spa = vd->vdev_spa; + int err = 0; + + mutex_enter(&vd->vdev_rebuild_lock); + vd->vdev_rebuilding = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + mutex_exit(&vd->vdev_rebuild_lock); + return (SET_ERROR(ENOTSUP)); + } + + ASSERT(vd->vdev_top == vd); + + err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp); + + /* + * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should + * not prevent a pool from being imported. Clear the rebuild + * status allowing a new resilver/rebuild to be started. + */ + if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + } else if (err) { + mutex_exit(&vd->vdev_rebuild_lock); + return (err); + } + + vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; + vr->vr_top_vdev = vd; + + mutex_exit(&vd->vdev_rebuild_lock); + + return (0); +} + +/* + * Each scan thread is responsible for rebuilding a top-level vdev. The + * rebuild progress in tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. + */ +static void +vdev_rebuild_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + + /* + * If there's a scrub in process request that it be stopped. This + * is not required for a correct rebuild, but we do want rebuilds to + * emulate the resilver behavior as much as possible. 
+ */ + dsl_pool_t *dsl = spa_get_dsl(spa); + if (dsl_scan_scrubbing(dsl)) + dsl_scan_cancel(dsl); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); + ASSERT(vd->vdev_rebuilding); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); + ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); + ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); + + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vr->vr_top_vdev = vd; + vr->vr_scan_msp = NULL; + vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + vr->vr_pass_start_time = gethrtime(); + vr->vr_pass_bytes_scanned = 0; + vr->vr_pass_bytes_issued = 0; + + uint64_t update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, 0); + + clear_rebuild_bytes(vr->vr_top_vdev); + + mutex_exit(&vd->vdev_rebuild_lock); + + /* + * Systematically walk the metaslabs and issue rebuild I/Os for + * all ranges in the allocated space map. + */ + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + vr->vr_scan_msp = msp; + + /* + * Removal of vdevs from the vdev tree may eliminate the need + * for the rebuild, in which case it should be canceled. The + * vdev_rebuild_cancel_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. + */ + if (vdev_rebuild_should_cancel(vd)) { + vd->vdev_rebuild_cancel_wanted = B_TRUE; + error = EINTR; + break; + } + + ASSERT0(range_tree_space(vr->vr_scan_tree)); + + /* + * Disable any new allocations to this metaslab and wait + * for any writes inflight to complete. This is needed to + * ensure all allocated ranges are rebuilt. 
+ */ + metaslab_disable(msp); + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(dsl, 0); + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * When a metaslab has been allocated from read its allocated + * ranges from the space map object in to the vr_scan_tree. + * Then add inflight / unflushed ranges and remove inflight / + * unflushed frees. This is the minimum range to be rebuilt. + */ + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + vr->vr_scan_tree, SM_ALLOC)); + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(range_tree_space( + msp->ms_allocating[i])); + } + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, vr->vr_scan_tree); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, vr->vr_scan_tree); + + /* + * Remove ranges which have already been rebuilt based + * on the last offset. This can happen when restarting + * a scan after exporting and re-importing the pool. + */ + range_tree_clear(vr->vr_scan_tree, 0, + vrp->vrp_last_offset); + } + + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + /* + * To provide an accurate estimate re-calculate the estimated + * size every 5 minutes to account for recent allocations and + * frees made space maps which have not yet been rebuilt. + */ + if (gethrtime() > update_est_time + SEC2NSEC(300)) { + update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, i); + } + + /* + * Walk the allocated space map and issue the rebuild I/O. 
+ */ + error = vdev_rebuild_ranges(vr); + range_tree_vacate(vr->vr_scan_tree, NULL, NULL); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + metaslab_enable(msp, B_FALSE, B_FALSE); + + if (error != 0) + break; + } + + range_tree_destroy(vr->vr_scan_tree); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* Wait for any remaining rebuild I/O to complete */ + mutex_enter(&vd->vdev_rebuild_io_lock); + while (vd->vdev_rebuild_inflight > 0) + cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + + mutex_exit(&vd->vdev_rebuild_io_lock); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + dsl_pool_t *dp = spa_get_dsl(spa); + dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (error == 0) { + /* + * After a successful rebuild clear the DTLs of all ranges + * which were missing when the rebuild was started. These + * ranges must have been rebuilt as a consequence of rebuilding + * all allocated space. Note that unlike a scrub or resilver + * the rebuild operation will reconstruct data only referenced + * by a pool checkpoint. See the dsl_scan_done() comments. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else if (vd->vdev_rebuild_cancel_wanted) { + /* + * The rebuild operation was canceled. This will occur when + * a device participating in the rebuild is detached. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else if (vd->vdev_rebuild_reset_wanted) { + /* + * Reset the running rebuild without canceling and restarting + * it. This will occur when a new device is attached and must + * participate in the rebuild. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else { + /* + * The rebuild operation should be suspended. 
This may occur + * when detaching a child vdev or when exporting the pool. The + * rebuild is left in the active state so it will be resumed. + */ + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + vd->vdev_rebuilding = B_FALSE; + } + + dmu_tx_commit(tx); + + vd->vdev_rebuild_thread = NULL; + mutex_exit(&vd->vdev_rebuild_lock); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Returns B_TRUE if any top-level vdev are rebuilding. + */ +boolean_t +vdev_rebuild_active(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + boolean_t ret = B_FALSE; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) { + ret = vdev_rebuild_active(vd->vdev_child[i]); + if (ret) + return (ret); + } + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + mutex_exit(&vd->vdev_rebuild_lock); + } + + return (ret); +} + +/* + * Start a rebuild operation. The rebuild may be restarted when the + * top-level vdev is currently actively rebuilding. + */ +void +vdev_rebuild(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_top == vd); + ASSERT(vdev_is_concrete(vd)); + ASSERT(!vd->vdev_removing); + ASSERT(spa_feature_is_enabled(vd->vdev_spa, + SPA_FEATURE_DEVICE_REBUILD)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuilding) { + ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); + + /* + * Signal a running rebuild operation that it should restart + * from the beginning because a new device was attached. The + * vdev_rebuild_reset_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. 
+ */ + if (!vd->vdev_rebuild_reset_wanted) + vd->vdev_rebuild_reset_wanted = B_TRUE; + } else { + vdev_rebuild_initiate(vd); + } + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_restart_impl(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_restart_impl(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && + vdev_writeable(vd) && !vd->vdev_rebuilding) { + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_DEVICE_REBUILD)); + vd->vdev_rebuilding = B_TRUE; + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, + maxclsyspri); + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Conditionally restart all of the vdev_rebuild_thread's for a pool. The + * feature flag must be active and the rebuild in the active state. This + * cannot be used to start a new rebuild. + */ +void +vdev_rebuild_restart(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vdev_rebuild_restart_impl(spa->spa_root_vdev); +} + +/* + * Stop and wait for all of the vdev_rebuild_thread's associated with the + * vdev tree provide to be terminated (canceled or stopped). 
+ */ +void +vdev_rebuild_stop_wait(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_stop_wait(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + ASSERT(vd == vd->vdev_top); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuild_thread != NULL) { + vd->vdev_rebuild_exit_wanted = B_TRUE; + while (vd->vdev_rebuilding) { + cv_wait(&vd->vdev_rebuild_cv, + &vd->vdev_rebuild_lock); + } + vd->vdev_rebuild_exit_wanted = B_FALSE; + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Stop all rebuild operations but leave them in the active state so they + * will be resumed when importing the pool. + */ +void +vdev_rebuild_stop_all(spa_t *spa) +{ + vdev_rebuild_stop_wait(spa->spa_root_vdev); +} + +/* + * Rebuild statistics reported per top-level vdev. + */ +int +vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) +{ + spa_t *spa = tvd->vdev_spa; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (SET_ERROR(ENOTSUP)); + + if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) + return (SET_ERROR(EINVAL)); + + int error = zap_contains(spa_meta_objset(spa), + tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); + + if (error == ENOENT) { + bzero(vrs, sizeof (vdev_rebuild_stat_t)); + vrs->vrs_state = VDEV_REBUILD_NONE; + error = 0; + } else if (error == 0) { + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&tvd->vdev_rebuild_lock); + vrs->vrs_state = vrp->vrp_rebuild_state; + vrs->vrs_start_time = vrp->vrp_start_time; + vrs->vrs_end_time = vrp->vrp_end_time; + vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; + vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; + vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; + vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; + vrs->vrs_bytes_est = vrp->vrp_bytes_est; + vrs->vrs_errors = 
vrp->vrp_errors; + vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - + vr->vr_pass_start_time); + vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; + vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + mutex_exit(&tvd->vdev_rebuild_lock); + } + + return (error); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, + "Max segment size in bytes of rebuild reads"); +/* END CSTYLED */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 4122114b5..1d2ae6270 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1938,8 +1938,9 @@ static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; - int replacing = zc->zc_cookie; nvlist_t *config; + int replacing = zc->zc_cookie; + int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) @@ -1947,7 +1948,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { - error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, + rebuild); nvlist_free(config); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 765ffea8a..f6478dd0d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -487,7 +487,8 @@ tests = ['zpool_wait_discard', 'zpool_wait_freeing', tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] -tests = ['zpool_wait_replace_cancel', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', +tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', + 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] @@ -748,7 +749,11 @@ tests = ['rename_dirs_001_pos'] tags = ['functional', 'rename_dirs'] [tests/functional/replacement] -tests = ['replacement_001_pos', 'replacement_002_pos', 
'replacement_003_pos'] +tests = ['attach_import', 'attach_multiple', 'attach_rebuild', + 'attach_resilver', 'detach', 'rebuild_disabled_feature', + 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', + 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', + 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] @@ -762,10 +767,6 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_022_pos'] tags = ['functional', 'reservation'] -[tests/functional/resilver] -tests = ['resilver_restart_001', 'resilver_restart_002'] -tags = ['functional', 'resilver'] - [tests/functional/rootpool] tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] tags = ['functional', 'rootpool'] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 9fbcc37c6..5e07cda4d 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -2222,26 +2222,27 @@ function check_pool_status # pool token keyword if [[ $verbose == true ]]; then log_note $scan fi - echo $scan | grep -i "$keyword" > /dev/null 2>&1 + echo $scan | egrep -i "$keyword" > /dev/null 2>&1 return $? 
} # # The following functions are instance of check_pool_status() -# is_pool_resilvering - to check if the pool is resilver in progress -# is_pool_resilvered - to check if the pool is resilver completed -# is_pool_scrubbing - to check if the pool is scrub in progress -# is_pool_scrubbed - to check if the pool is scrub completed -# is_pool_scrub_stopped - to check if the pool is scrub stopped -# is_pool_scrub_paused - to check if the pool has scrub paused -# is_pool_removing - to check if the pool is removing a vdev -# is_pool_removed - to check if the pool is remove completed -# is_pool_discarding - to check if the pool has checkpoint being discarded +# is_pool_resilvering - to check if the pool resilver is in progress +# is_pool_resilvered - to check if the pool resilver is completed +# is_pool_scrubbing - to check if the pool scrub is in progress +# is_pool_scrubbed - to check if the pool scrub is completed +# is_pool_scrub_stopped - to check if the pool scrub is stopped +# is_pool_scrub_paused - to check if the pool scrub has paused +# is_pool_removing - to check if the pool removing is a vdev +# is_pool_removed - to check if the pool remove is completed +# is_pool_discarding - to check if the pool checkpoint is being discarded # function is_pool_resilvering #pool { - check_pool_status "$1" "scan" "resilver in progress since " $2 + check_pool_status "$1" "scan" \ + "resilver[ ()0-9A-Za-z_-]* in progress since" $2 return $? 
} @@ -3487,7 +3488,7 @@ function wait_scrubbed typeset pool=${1:-$TESTPOOL} while true ; do is_pool_scrubbed $pool && break - log_must sleep 1 + sleep 1 done } diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am index 24f3e50bb..c56518c55 100644 --- a/tests/zfs-tests/tests/functional/Makefile.am +++ b/tests/zfs-tests/tests/functional/Makefile.am @@ -65,7 +65,6 @@ SUBDIRS = \ rename_dirs \ replacement \ reservation \ - resilver \ rootpool \ rsend \ scrub_mirror \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index ee5b2b4e1..4991b76bf 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -79,6 +79,7 @@ typeset -a properties=( "feature@redacted_datasets" "feature@bookmark_written" "feature@log_spacemap" + "feature@device_rebuild" ) if is_linux || is_freebsd; then diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am index 6a21cac4f..451d83a79 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am @@ -4,6 +4,7 @@ dist_pkgdata_SCRIPTS = \ cleanup.ksh \ zpool_wait_replace.ksh \ zpool_wait_replace_cancel.ksh \ + zpool_wait_rebuild.ksh \ zpool_wait_resilver.ksh \ zpool_wait_scrub_basic.ksh \ zpool_wait_scrub_cancel.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh new file mode 100755 index 000000000..8cd586459 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied 
under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib + +# +# DESCRIPTION: +# 'zpool wait' works when waiting for sequential resilvering to complete. +# +# STRATEGY: +# 1. Attach a device to the pool so that sequential resilvering starts. +# 2. Start 'zpool wait'. +# 3. Monitor the waiting process to make sure it returns neither too soon nor +# too late. +# 4. Repeat 1-3, except using the '-w' flag with 'zpool attach' instead of using +# 'zpool wait'. +# + +function cleanup +{ + remove_io_delay + kill_if_running $pid + get_disklist $TESTPOOL | grep $DISK2 >/dev/null && \ + log_must zpool detach $TESTPOOL $DISK2 +} + +typeset -r IN_PROGRESS_CHECK="is_pool_resilvering $TESTPOOL" +typeset pid + +log_onexit cleanup + +add_io_delay $TESTPOOL + +# Test 'zpool wait -t resilver' +log_must zpool attach -s $TESTPOOL $DISK1 $DISK2 +log_bkgrnd zpool wait -t resilver $TESTPOOL +pid=$! +check_while_waiting $pid "$IN_PROGRESS_CHECK" + +log_must zpool detach $TESTPOOL $DISK2 + +# Test 'zpool attach -w' +log_bkgrnd zpool attach -sw $TESTPOOL $DISK1 $DISK2 +pid=$! +while ! is_pool_resilvering $TESTPOOL && proc_exists $pid; do + log_must sleep .5 +done +check_while_waiting $pid "$IN_PROGRESS_CHECK" + +log_pass "'zpool wait -t resilver' and 'zpool attach -w' work." 
diff --git a/tests/zfs-tests/tests/functional/replacement/Makefile.am b/tests/zfs-tests/tests/functional/replacement/Makefile.am index d47fcd5e1..fe6e49121 100644 --- a/tests/zfs-tests/tests/functional/replacement/Makefile.am +++ b/tests/zfs-tests/tests/functional/replacement/Makefile.am @@ -2,9 +2,20 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/replacement dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - replacement_001_pos.ksh \ - replacement_002_pos.ksh \ - replacement_003_pos.ksh + attach_import.ksh \ + attach_multiple.ksh \ + attach_rebuild.ksh \ + attach_resilver.ksh \ + detach.ksh \ + rebuild_disabled_feature.ksh \ + rebuild_multiple.ksh \ + rebuild_raidz.ksh \ + replace_import.ksh \ + replace_rebuild.ksh \ + replace_resilver.ksh \ + resilver_restart_001.ksh \ + resilver_restart_002.ksh \ + scrub_cancel.ksh dist_pkgdata_DATA = \ replacement.cfg diff --git a/tests/zfs-tests/tests/functional/replacement/attach_import.ksh b/tests/zfs-tests/tests/functional/replacement/attach_import.ksh new file mode 100755 index 000000000..e2749b164 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/attach_import.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify that on import an in progress attach operation is resumed. +# +# Strategy: +# 1. For both healing and sequential resilvering. +# a. 
Create a pool +# b. Add a vdev with 'zpool attach' and resilver (-s) it. +# c. Export the pool +# d. Import the pool +# e. Verify the 'zpool attach' resumed resilvering +# f. Destroy the pool +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} +} + +log_assert "Verify attach is resumed on import" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} + +# Verify healing and sequential resilver resume on import. +for arg in "" "-s"; do + log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]} + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + log_must zpool attach $arg $TESTPOOL1 ${VDEV_FILES[0]} ${VDEV_FILES[1]} + log_must is_pool_resilvering $TESTPOOL1 + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1 + log_must is_pool_resilvering $TESTPOOL1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS + log_must zpool wait -t resilver $TESTPOOL1 + log_must is_pool_resilvered $TESTPOOL1 + destroy_pool $TESTPOOL1 +done + +log_pass "Verify attach is resumed on import" diff --git a/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh new file mode 100755 index 000000000..b3192b2bf --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. 
All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify that attach/detach work while resilvering and attaching +# multiple vdevs. +# +# Strategy: +# 1. Create a single vdev pool +# 2. While healing or sequential resilvering: +# a. Attach a vdev to convert the pool to a mirror. +# b. Attach a vdev to convert the pool to a 3-way mirror. +# c. Verify the original vdev cannot be removed (no redundant copies) +# d. Detach a vdev. Healing and sequential resilver remain running. +# e. Detach a vdev. Healing resilver remains running, sequential +# resilver is canceled. +# f. Wait for resilver to complete. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} +} + +log_assert "Verify attach/detach with multiple vdevs" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} + +# Verify resilver resumes on import. +log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]} + +for replace_mode in "healing" "sequential"; do + # + # Resilvers abort the dsl_scan and reconfigure it for resilvering. + # Rebuilds cancel the dsl_scan and start the vdev_rebuild thread. 
+ # + if [[ "$replace_mode" = "healing" ]]; then + flags="" + else + flags="-s" + fi + + log_mustnot is_pool_resilvering $TESTPOOL1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + + # Attach first vdev (stripe -> mirror) + log_must zpool attach $flags $TESTPOOL1 \ + ${VDEV_FILES[0]} ${VDEV_FILES[1]} + log_must is_pool_resilvering $TESTPOOL1 + + # Attach second vdev (2-way -> 3-way mirror) + log_must zpool attach $flags $TESTPOOL1 \ + ${VDEV_FILES[1]} ${VDEV_FILES[2]} + log_must is_pool_resilvering $TESTPOOL1 + + # Original vdev cannot be detached until there is sufficient redundancy. + log_mustnot zpool detach $TESTPOOL1 ${VDEV_FILES[0]} + + # Detach first vdev (resilver keeps running) + log_must zpool detach $TESTPOOL1 ${VDEV_FILES[1]} + log_must is_pool_resilvering $TESTPOOL1 + + # + # Detach second vdev. There's a difference in behavior between + # healing and sequential resilvers. A healing resilver will not be + # canceled even though there's nothing on the original vdev which + # needs to be rebuilt. A sequential resilver on the other hand is + # canceled when returning to a non-redundant striped layout. At + # some point the healing resilver behavior should be updated to match + # the sequential resilver behavior. 
+ # + log_must zpool detach $TESTPOOL1 ${VDEV_FILES[2]} + + if [[ "$replace_mode" = "healing" ]]; then + log_must is_pool_resilvering $TESTPOOL1 + else + log_mustnot is_pool_resilvering $TESTPOOL1 + fi + + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + log_must zpool wait $TESTPOOL1 +done + +log_pass "Verify attach/detach with multiple vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh new file mode 100755 index 000000000..e9427c7ad --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh @@ -0,0 +1,173 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Attaching disks during I/O should pass for supported pools. +# +# STRATEGY: +# 1. 
Create multidisk pools (stripe/mirror/raidz) and +# start some random I/O +# 2. Attach a disk to the pool. +# 3. Verify the integrity of the file system and the resilvering. +# +# NOTE: Raidz does not support the sequential resilver (-s) option. +# + +verify_runnable "global" + +function cleanup +{ + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids; do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk during I/O completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function attach_test +{ + typeset -i iters=2 + typeset -i index=0 + typeset opt=$1 + typeset disk1=$2 + typeset disk2=$3 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! 
+ + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + log_must zpool attach -sw $opt $TESTPOOL1 $disk1 $disk2 + + for wait_pid in $child_pids; do + kill $wait_pid + done + child_pids="" + + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +specials_list="" +i=0 +while [[ $i != 3 ]]; do + truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +# +# Create a replacement disk special file. +# +truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE + +for op in "" "-f"; do + create_pool $TESTPOOL1 mirror $specials_list + log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE + + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" + if [[ $? -ne 0 ]]; then + log_fail "$REPLACEFILE is not present." + fi + + destroy_pool $TESTPOOL1 +done + +log_note "Verify 'zpool attach' fails with non-mirrors." + +for type in "" "raidz" "raidz1"; do + for op in "" "-f"; do + create_pool $TESTPOOL1 $type $specials_list + log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + log_mustnot zpool attach -s "$opt" $TESTDIR/$TESTFILE1.1 \ + $TESTDIR/$REPLACEFILE + + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" + if [[ $? -eq 0 ]]; then + log_fail "$REPLACEFILE should not be present." 
+ fi + + destroy_pool $TESTPOOL1 + done +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh similarity index 92% rename from tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh rename to tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh index 391aa5cf0..4261d4d67 100755 --- a/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh @@ -104,9 +104,7 @@ function attach_test ((i = i + 1)) done - log_must zpool attach $opt $TESTPOOL1 $disk1 $disk2 - - sleep 10 + log_must zpool attach -w $opt $TESTPOOL1 $disk1 $disk2 for wait_pid in $child_pids do @@ -119,13 +117,13 @@ function attach_test log_must zfs umount $TESTPOOL1/$TESTFS1 log_must zdb -cdui $TESTPOOL1/$TESTFS1 log_must zfs mount $TESTPOOL1/$TESTFS1 - + verify_pool $TESTPOOL1 } specials_list="" i=0 -while [[ $i != 2 ]]; do - mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i +while [[ $i != 3 ]]; do + truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" ((i = i + 1)) @@ -134,7 +132,7 @@ done # # Create a replacement disk special file. # -mkfile $MINVDEVSIZE $TESTDIR/$REPLACEFILE +truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE for op in "" "-f"; do create_pool $TESTPOOL1 mirror $specials_list @@ -143,7 +141,7 @@ for op in "" "-f"; do attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE" + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" if [[ $? -ne 0 ]]; then log_fail "$REPLACEFILE is not present." fi @@ -162,7 +160,7 @@ for type in "" "raidz" "raidz1"; do log_mustnot zpool attach "$opt" $TESTDIR/$TESTFILE1.1 \ $TESTDIR/$REPLACEFILE - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE" + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" if [[ $? 
-eq 0 ]]; then log_fail "$REPLACEFILE should not be present." fi diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh b/tests/zfs-tests/tests/functional/replacement/detach.ksh similarity index 94% rename from tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh rename to tests/zfs-tests/tests/functional/replacement/detach.ksh index 71b9602ee..aa3ec4f7a 100755 --- a/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/replacement/detach.ksh @@ -121,8 +121,8 @@ function detach_test specials_list="" i=0 -while [[ $i != 2 ]]; do - mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i +while [[ $i != 3 ]]; do + truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" ((i = i + 1)) @@ -134,7 +134,7 @@ log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 detach_test $TESTDIR/$TESTFILE1.1 -zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1" +zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1" if [[ $? -eq 0 ]]; then log_fail "$TESTFILE1.1 should no longer be present." fi @@ -143,14 +143,14 @@ destroy_pool $TESTPOOL1 log_note "Verify 'zpool detach' fails with non-mirrors." -for type in "" "raidz" "raidz1" ; do +for type in "" "raidz" "raidz1"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 log_mustnot zpool detach $TESTDIR/$TESTFILE1.1 - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1" + zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1" if [[ $? -ne 0 ]]; then log_fail "$TESTFILE1.1 is not present." 
fi diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh new file mode 100755 index 000000000..d17d83b78 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify device_rebuild feature flags. +# +# Strategy: +# 1. Create a pool with all features disabled. +# 2. Verify 'zpool replace -s' fails and the feature is disabled. +# 3. Enable the device_rebuild feature. +# 4. Verify 'zpool replace -s' works and the feature is active. +# 5. Wait for the feature to return to enabled. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +function check_feature_flag +{ + feature=$1 + pool=$2 + expected_value=$3 + + value="$(zpool get -H -o property,value all $pool | \ + egrep "$feature" | awk '{print $2}')" + if [ "$value" = "$expected_value" ]; then + log_note "$feature verified to be $value" + else + log_fail "$feature should be $expected_value but is $value" + fi +} + +log_assert "Verify device_rebuild feature flags." 
+ +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE +log_must zpool create -d $TESTPOOL1 ${VDEV_FILES[@]} + +log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "disabled" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must zpool set feature@device_rebuild=enabled $TESTPOOL1 +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "active" + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS +log_must zpool wait -t resilver $TESTPOOL1 +check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "enabled" + +log_pass "Verify device_rebuild feature flags." diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh new file mode 100755 index 000000000..7775cbff4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh @@ -0,0 +1,126 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Sequential reconstruction (unlike healing reconstruction) operate on the +# top-level vdev. 
This means that a sequential resilver operation can be +# started/stopped on a different top-level vdev without impacting other +# sequential resilvers. +# +# STRATEGY: +# 1. Create a mirrored pool. +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE $SPARE_VDEV_FILE2 +} + +function check_history +{ + pool=$1 + msg=$2 + exp=$3 + + count=$(zpool history -i $pool | grep "rebuild" | grep -c "$msg") + if [[ "$count" -ne "$exp" ]]; then + log_fail "Expected $exp rebuild '$msg' messages, found $count" + else + log_note "Found $count/$exp rebuild '$msg' messages" + fi +} + +log_assert "Rebuilds operate on the top-level vdevs" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} \ + $SPARE_VDEV_FILE $SPARE_VDEV_FILE2 + +# Verify two sequential resilvers can run concurrently. +log_must zpool create -f $TESTPOOL1 \ + mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \ + mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]} +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32 +log_must zpool sync $TESTPOOL1 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2 + +check_history $TESTPOOL1 "started" 2 +check_history $TESTPOOL1 "reset" 0 +check_history $TESTPOOL1 "complete" 0 +check_history $TESTPOOL1 "canceled" 0 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS +log_must zpool wait -t resilver $TESTPOOL1 + +check_history $TESTPOOL1 "complete" 2 +destroy_pool $TESTPOOL1 + +# Verify canceling one resilver (zpool detach) does not impact others. 
+log_must zpool create -f $TESTPOOL1 \ + mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \ + mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]} +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32 +log_must zpool sync $TESTPOOL1 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2 + +check_history $TESTPOOL1 "started" 2 +check_history $TESTPOOL1 "reset" 0 +check_history $TESTPOOL1 "complete" 0 +check_history $TESTPOOL1 "canceled" 0 + +log_must zpool detach $TESTPOOL1 $SPARE_VDEV_FILE2 + +check_history $TESTPOOL1 "complete" 0 +check_history $TESTPOOL1 "canceled" 1 + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS +log_must zpool wait -t resilver $TESTPOOL1 + +check_history $TESTPOOL1 "complete" 1 +check_history $TESTPOOL1 "canceled" 1 +destroy_pool $TESTPOOL1 + +log_pass "Rebuilds operate on the top-level vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh new file mode 100755 index 000000000..c919b44b2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. 
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Executing 'zpool replace -s' for raidz vdevs failed. Sequential +# resilvers are only allowed for stripe/mirror pools. +# +# STRATEGY: +# 1. Create a raidz pool, verify 'zpool replace -s' fails +# 2. Create a stripe/mirror pool, verify 'zpool replace -s' passes +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +log_assert "Sequential resilver is not allowed for raidz vdevs" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +# raidz[1-3] +for vdev_type in "raidz" "raidz2" "raidz3"; do + log_must zpool create -f $TESTPOOL1 $vdev_type ${VDEV_FILES[@]} + log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} \ + $SPARE_VDEV_FILE + destroy_pool $TESTPOOL1 +done + +# stripe +log_must zpool create $TESTPOOL1 ${VDEV_FILES[@]} +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +destroy_pool $TESTPOOL1 + +# mirror +log_must zpool create $TESTPOOL1 mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +destroy_pool $TESTPOOL1 + +log_pass "Sequential resilver is not allowed for raidz vdevs" diff --git a/tests/zfs-tests/tests/functional/replacement/replace_import.ksh b/tests/zfs-tests/tests/functional/replacement/replace_import.ksh new file mode 100755 index 000000000..35d51d939 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/replace_import.ksh @@ -0,0 +1,67 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# Description: +# Verify that on import an in progress replace operation is resumed. +# +# Strategy: +# 1. For both healing and sequential resilvering replace: +# a. Create a pool +# b. Replace a vdev with 'zpool replace' to resilver (-s) it. +# c. Export the pool +# d. Import the pool +# e. Verify the 'zpool replace' resumed resilvering. +# f. Destroy the pool +# + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +log_assert "Verify replace is resumed on import" + +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +# Verify healing and sequential resilver resume on import. 
+for arg in "" "-s"; do + log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]} + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE + log_must is_pool_resilvering $TESTPOOL1 + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1 + log_must is_pool_resilvering $TESTPOOL1 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS + log_must zpool wait -t resilver $TESTPOOL1 + log_must is_pool_resilvered $TESTPOOL1 + destroy_pool $TESTPOOL1 +done + +log_pass "Verify replace is resumed on import" diff --git a/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh new file mode 100755 index 000000000..599735228 --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh @@ -0,0 +1,158 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. 
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Replacing disks during I/O should pass for supported pools. +# +# STRATEGY: +# 1. Create multidisk pools (stripe/mirror) and +# start some random I/O +# 2. Replace a disk in the pool with another disk. +# 3. Verify the integrity of the file system and the rebuilding. +# +# NOTE: Raidz does not support the sequential resilver (-s) option. +# + +verify_runnable "global" + +function cleanup +{ + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids + do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk with -r during I/O completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function replace_test +{ + typeset -i iters=2 + typeset -i index=0 + typeset opt=$1 + typeset disk1=$2 + typeset disk2=$3 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! 
+ + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + log_must zpool replace -sw $opt $TESTPOOL1 $disk1 $disk2 + + for wait_pid in $child_pids + do + kill $wait_pid + done + child_pids="" + + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +specials_list="" +i=0 +while [[ $i != 3 ]]; do + log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +# +# Create a replacement disk special file. +# +log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE + +for type in "" "mirror"; do + for op in "" "-f"; do + create_pool $TESTPOOL1 $type $specials_list + log_must zfs create $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + replace_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE + + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" + if [[ $? -ne 0 ]]; then + log_fail "$REPLACEFILE is not present." 
+ fi + + destroy_pool $TESTPOOL1 + log_must rm -rf /$TESTPOOL1 + done +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh similarity index 95% rename from tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh rename to tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh index 8f40436ff..253cf65e4 100755 --- a/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh @@ -104,9 +104,7 @@ function replace_test ((i = i + 1)) done - log_must zpool replace $opt $TESTPOOL1 $disk1 $disk2 - - sleep 10 + log_must zpool replace -w $opt $TESTPOOL1 $disk1 $disk2 for wait_pid in $child_pids do @@ -119,11 +117,12 @@ function replace_test log_must zfs umount $TESTPOOL1/$TESTFS1 log_must zdb -cdui $TESTPOOL1/$TESTFS1 log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 } specials_list="" i=0 -while [[ $i != 2 ]]; do +while [[ $i != 3 ]]; do log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" @@ -143,7 +142,7 @@ for type in "" "raidz" "mirror"; do replace_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE - zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE" + zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE" if [[ $? -ne 0 ]]; then log_fail "$REPLACEFILE is not present." 
fi diff --git a/tests/zfs-tests/tests/functional/replacement/replacement.cfg b/tests/zfs-tests/tests/functional/replacement/replacement.cfg index b2ba1b885..271317b1c 100644 --- a/tests/zfs-tests/tests/functional/replacement/replacement.cfg +++ b/tests/zfs-tests/tests/functional/replacement/replacement.cfg @@ -36,3 +36,8 @@ export HOLES_SEED=${HOLES_SEED-""} export HOLES_FILEOFFSET=${HOLES_FILEOFFSET-""} export HOLES_COUNT=${HOLES_COUNT-"16384"} # FILESIZE/BLKSIZE/8 export REPLACEFILE="sparedisk" + +set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4} +export VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 )) +export SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1 +export SPARE_VDEV_FILE2=$TEST_BASE_DIR/spare-2 diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh similarity index 88% rename from tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh rename to tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh index 9af1c972f..7896b2dbe 100755 --- a/tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh +++ b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh @@ -20,7 +20,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/resilver/resilver.cfg +. 
$STF_SUITE/tests/functional/replacement/replacement.cfg # # DESCRIPTION: @@ -50,7 +50,7 @@ function cleanup $ORIG_SCAN_SUSPEND_PROGRESS log_must set_tunable32 ZEVENT_LEN_MAX $ORIG_ZFS_ZEVENT_LEN_MAX log_must zinject -c all - destroy_pool $TESTPOOL + destroy_pool $TESTPOOL1 rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE } @@ -70,7 +70,7 @@ function verify_restarts # [[ -z "$defer" ]] && return # use zdb to find which vdevs have the resilver defer flag - VDEV_DEFERS=$(zdb -C $TESTPOOL | awk ' + VDEV_DEFERS=$(zdb -C $TESTPOOL1 | awk ' /children/ { gsub(/[^0-9]/, ""); child = $0 } /com\.datto:resilver_defer$/ { print child } ') @@ -106,17 +106,17 @@ log_must set_tunable32 ZEVENT_LEN_MAX 512 log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE -log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL \ +log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL1 \ raidz ${VDEV_FILES[@]} # create 4 filesystems for fs in fs{0..3} do - log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL/$fs + log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL1/$fs done # simultaneously write 16M to each of them -set -A DATAPATHS /$TESTPOOL/fs{0..3}/dat.0 +set -A DATAPATHS /$TESTPOOL1/fs{0..3}/dat.0 log_note "Writing data files" for path in ${DATAPATHS[@]} do @@ -131,7 +131,7 @@ do if [[ $test == "with" ]] then - log_must zpool set feature@resilver_defer=enabled $TESTPOOL + log_must zpool set feature@resilver_defer=enabled $TESTPOOL1 RESTARTS=( "${DEFER_RESTARTS[@]}" ) VDEVS=( "${DEFER_VDEVS[@]}" ) VDEV_REPLACE="$SPARE_VDEV_FILE ${VDEV_FILES[1]}" @@ -144,7 +144,7 @@ do log_must set_tunable32 RESILVER_MIN_TIME_MS 50 # initiate a resilver and suspend the scan as soon as possible - log_must zpool replace $TESTPOOL $VDEV_REPLACE + log_must zpool replace $TESTPOOL1 $VDEV_REPLACE log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 # there should only be 1 resilver start @@ -152,16 +152,16 @@ do # offline then online a vdev to 
introduce a new DTL range after current # scan, which should restart (or defer) the resilver - log_must zpool offline $TESTPOOL ${VDEV_FILES[2]} - log_must zpool sync $TESTPOOL - log_must zpool online $TESTPOOL ${VDEV_FILES[2]} - log_must zpool sync $TESTPOOL + log_must zpool offline $TESTPOOL1 ${VDEV_FILES[2]} + log_must zpool sync $TESTPOOL1 + log_must zpool online $TESTPOOL1 ${VDEV_FILES[2]} + log_must zpool sync $TESTPOOL1 # there should now be 2 resilver starts w/o defer, 1 with defer verify_restarts ' after offline/online' "${RESTARTS[1]}" "${VDEVS[1]}" # inject read io errors on vdev and verify resilver does not restart - log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL + log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL1 log_must cat ${DATAPATHS[1]} > /dev/null log_must zinject -c all @@ -173,17 +173,12 @@ do log_must set_tunable32 RESILVER_MIN_TIME_MS 3000 # wait for resilver to finish - for iter in {0..59} - do - is_pool_resilvered $TESTPOOL && break - sleep 1 - done - is_pool_resilvered $TESTPOOL || - log_fail "resilver timed out" + log_must zpool wait -t resilver $TESTPOOL1 + log_must is_pool_resilvered $TESTPOOL1 # wait for a few txg's to see if a resilver happens - log_must zpool sync $TESTPOOL - log_must zpool sync $TESTPOOL + log_must zpool sync $TESTPOOL1 + log_must zpool sync $TESTPOOL1 # there should now be 2 resilver starts verify_restarts ' after resilver' "${RESTARTS[3]}" "${VDEVS[3]}" diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh similarity index 80% rename from tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh rename to tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh index ebe5e693b..48763f9b2 100755 --- a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh +++ b/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh @@ -20,7 
+20,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/resilver/resilver.cfg +. $STF_SUITE/tests/functional/replacement/replacement.cfg # # DESCRIPTION: @@ -40,7 +40,7 @@ function cleanup { log_must zinject -c all - destroy_pool $TESTPOOL + destroy_pool $TESTPOOL1 rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY } @@ -56,25 +56,25 @@ log_must set_tunable32 SCAN_LEGACY 1 # create the pool and a 32M file (32k blocks) log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE -log_must zpool create -f -O recordsize=1k $TESTPOOL ${VDEV_FILES[0]} -log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=32 > /dev/null 2>&1 +log_must zpool create -f -O recordsize=1k $TESTPOOL1 ${VDEV_FILES[0]} +log_must dd if=/dev/urandom of=/$TESTPOOL1/file bs=1M count=32 > /dev/null 2>&1 # determine objset/object -objset=$(zdb -d $TESTPOOL/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p') -object=$(ls -i /$TESTPOOL/file | awk '{print $1}') +objset=$(zdb -d $TESTPOOL1/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p') +object=$(ls -i /$TESTPOOL1/file | awk '{print $1}') # inject event to cause error during resilver -log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL +log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL1 # clear events and start resilver log_must zpool events -c -log_must zpool attach $TESTPOOL ${VDEV_FILES[0]} $SPARE_VDEV_FILE +log_must zpool attach $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE log_note "waiting for read errors to start showing up" for iter in {0..59} do - zpool sync $TESTPOOL - err=$(zpool status $TESTPOOL | grep ${VDEV_FILES[0]} | awk '{print $3}') + zpool sync $TESTPOOL1 + err=$(zpool status $TESTPOOL1 | grep ${VDEV_FILES[0]} | awk '{print $3}') (( $err > 0 )) && break sleep 1 done @@ -92,8 +92,8 @@ done (( $finish == 0 )) && log_fail "resilver took too long to finish" # wait a few syncs to ensure that zfs does not restart the resilver -log_must zpool sync 
$TESTPOOL -log_must zpool sync $TESTPOOL +log_must zpool sync $TESTPOOL1 +log_must zpool sync $TESTPOOL1 # check if resilver was restarted start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l) diff --git a/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh b/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh new file mode 100755 index 000000000..da8a0a26e --- /dev/null +++ b/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Verify scrub behaves as intended when contending with a healing or +# sequential resilver. +# +# STRATEGY: +# 1. Create a pool +# 2. Add a modest amount of data to the pool. +# 3. For healing and sequential resilver: +# a. Start scrubbing. +# b. Verify a resilver can be started and it cancels the scrub. +# c. 
Verify a scrub cannot be started when resilvering +# + +function cleanup +{ + log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME + log_must set_tunable32 SCAN_SUSPEND_PROGRESS \ + $ORIG_SCAN_SUSPEND_PROGRESS + destroy_pool $TESTPOOL1 + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE +} + +log_assert "Scrub was cancelled by resilver" + +ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS) +ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS) + +log_onexit cleanup + +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE + +log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]} +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=64 +log_must zpool sync $TESTPOOL1 + +# Request a healing or sequential resilver +for replace_mode in "healing" "sequential"; do + + # + # Healing resilvers abort the dsl_scan and reconfigure it for + # resilvering. Sequential resilvers cancel the dsl_scan and start + # the vdev_rebuild thread. + # + if [[ "$replace_mode" = "healing" ]]; then + history_msg="scan aborted, restarting" + flags="" + else + history_msg="scan cancelled" + flags="-s" + fi + + # Limit scanning time and suspend the scan as soon as possible. + log_must set_tunable32 RESILVER_MIN_TIME_MS 50 + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + + # Initiate a scrub. + log_must zpool scrub $TESTPOOL1 + + # Initiate a resilver to cancel the scrub. + log_must zpool replace $flags $TESTPOOL1 ${VDEV_FILES[1]} \ + $SPARE_VDEV_FILE + + # Verify the scrub was canceled, it may take a few seconds to exit. + while is_pool_scrubbing $TESTPOOL1; do + sleep 1 + done + log_mustnot is_pool_scrubbing $TESTPOOL1 + + # Verify a scrub cannot be started while resilvering. + log_must is_pool_resilvering $TESTPOOL1 + log_mustnot zpool scrub $TESTPOOL1 + + # Unsuspend resilver. 
+ log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + log_must set_tunable32 RESILVER_MIN_TIME_MS 3000 + + # Wait for resilver to finish then put the original back. + log_must zpool wait $TESTPOOL1 + log_must zpool replace $flags -w $TESTPOOL1 $SPARE_VDEV_FILE \ + ${VDEV_FILES[1]} +done +log_pass "Scrub was cancelled by resilver" + diff --git a/tests/zfs-tests/tests/functional/resilver/Makefile.am b/tests/zfs-tests/tests/functional/resilver/Makefile.am deleted file mode 100644 index 38136a843..000000000 --- a/tests/zfs-tests/tests/functional/resilver/Makefile.am +++ /dev/null @@ -1,9 +0,0 @@ -pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/resilver -dist_pkgdata_SCRIPTS = \ - setup.ksh \ - cleanup.ksh \ - resilver_restart_001.ksh \ - resilver_restart_002.ksh - -dist_pkgdata_DATA = \ - resilver.cfg diff --git a/tests/zfs-tests/tests/functional/resilver/cleanup.ksh b/tests/zfs-tests/tests/functional/resilver/cleanup.ksh deleted file mode 100755 index 4dfa81424..000000000 --- a/tests/zfs-tests/tests/functional/resilver/cleanup.ksh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/ksh -p -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END - -# -# Copyright (c) 2019, Datto Inc. All rights reserved. -# - -. 
$STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/resilver/resilver.cfg - -verify_runnable "global" - -log_pass diff --git a/tests/zfs-tests/tests/functional/resilver/resilver.cfg b/tests/zfs-tests/tests/functional/resilver/resilver.cfg deleted file mode 100644 index 88dfd24ae..000000000 --- a/tests/zfs-tests/tests/functional/resilver/resilver.cfg +++ /dev/null @@ -1,32 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END - -# -# Copyright (c) 2019, Datto Inc. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib - -verify_runnable "global" - -set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4} -SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1 - -VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 )) diff --git a/tests/zfs-tests/tests/functional/resilver/setup.ksh b/tests/zfs-tests/tests/functional/resilver/setup.ksh deleted file mode 100755 index 4dfa81424..000000000 --- a/tests/zfs-tests/tests/functional/resilver/setup.ksh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/ksh -p -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. 
-# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END - -# -# Copyright (c) 2019, Datto Inc. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/resilver/resilver.cfg - -verify_runnable "global" - -log_pass