diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 02fb55944..852f85355 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3406,7 +3406,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { @@ -3422,13 +3422,17 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) svr->svr_allocd_segs, SM_ALLOC)); /* - * Clear everything past what has been synced, - * because we have not allocated mappings for it yet. + * Clear everything past what has been synced unless + * it's past the spacemap, because we have not allocated + * mappings for it yet. */ - range_tree_clear(svr->svr_allocd_segs, - vdev_indirect_mapping_max_offset(vim), - msp->ms_sm->sm_start + msp->ms_sm->sm_size - - vdev_indirect_mapping_max_offset(vim)); + uint64_t vim_max_offset = + vdev_indirect_mapping_max_offset(vim); + uint64_t sm_end = msp->ms_sm->sm_start + + msp->ms_sm->sm_size; + if (sm_end > vim_max_offset) + range_tree_clear(svr->svr_allocd_segs, + vim_max_offset, sm_end - vim_max_offset); } zcb->zcb_removing_size += diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0e3459d32..c81d446a5 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -445,6 +445,7 @@ static spa_t *ztest_spa = NULL; static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; +static boolean_t ztest_device_removal_active = B_FALSE; /* * The ztest_name_lock protects the pool and dataset namespace used by @@ -3203,7 +3204,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * value. Don't bother trying to attach while we are in the middle * of removal. */ - if (spa->spa_vdev_removal != NULL) { + if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&ztest_vdev_lock); return; @@ -3375,16 +3376,49 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id) spa_t *spa = ztest_spa; vdev_t *vd; uint64_t guid; + int error; mutex_enter(&ztest_vdev_lock); + if (ztest_device_removal_active) { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * Remove a random top-level vdev and wait for removal to finish. + */ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); guid = vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); - (void) spa_vdev_remove(spa, guid, B_FALSE); + error = spa_vdev_remove(spa, guid, B_FALSE); + if (error == 0) { + ztest_device_removal_active = B_TRUE; + mutex_exit(&ztest_vdev_lock); + while (spa->spa_vdev_removal != NULL) + txg_wait_synced(spa_get_dsl(spa), 0); + } else { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * The pool needs to be scrubbed after completing device removal. + * Failure to do so may result in checksum errors due to the + * strategy employed by ztest_fault_inject() when selecting which + * offset are redundant and can be damaged. + */ + error = spa_scan(spa, POOL_SCAN_SCRUB); + if (error == 0) { + while (dsl_scan_scrubbing(spa_get_dsl(spa))) + txg_wait_synced(spa_get_dsl(spa), 0); + } + + mutex_enter(&ztest_vdev_lock); + ztest_device_removal_active = B_FALSE; mutex_exit(&ztest_vdev_lock); } @@ -3524,7 +3558,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) * that the metaslab_class space increased (because it decreases * when the device removal completes). */ - if (spa->spa_vdev_removal != NULL) { + if (ztest_device_removal_active) { spa_config_exit(spa, SCL_STATE, FTAG); mutex_exit(&ztest_vdev_lock); return; @@ -5520,6 +5554,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); + + /* + * Device removal is in progress, fault injection must be disabled + * until it completes and the pool is scrubbed. The fault injection + * strategy for damaging blocks does not take in to account evacuated + * blocks which may have already been damaged. + */ + if (ztest_device_removal_active) { + mutex_exit(&ztest_vdev_lock); + goto out; + } + maxfaults = MAXFAULTS(zs); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; mirror_save = zs->zs_mirrors; @@ -5875,6 +5921,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; + /* + * Scrub in progress by device removal. + */ + if (ztest_device_removal_active) + return; + (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ (void) spa_scan(spa, POOL_SCAN_SCRUB); diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h index 5b1e3056b..80ee09456 100644 --- a/include/sys/vdev_removal.h +++ b/include/sys/vdev_removal.h @@ -30,7 +30,7 @@ extern "C" { #endif typedef struct spa_vdev_removal { - vdev_t *svr_vdev; + uint64_t svr_vdev_id; uint64_t svr_max_offset_to_sync[TXG_SIZE]; /* Thread performing a vdev removal. */ kthread_t *svr_thread; diff --git a/include/sys/zio.h b/include/sys/zio.h index a275b16de..dc6841d83 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -600,7 +600,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - struct abd *data, uint64_t size, int type, zio_priority_t priority, + struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 99b4ea29a..2d94cd320 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2879,7 +2879,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " - "or pool has removing/removed vdevs"), + "or device removal is in progress"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 818f26449..6e670facb 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1739,6 +1739,23 @@ Include cache hits in read history Use \fB1\fR for yes and \fB0\fR for no (default). .RE +.sp +.ne 2 +.na +\fBzfs_reconstruct_indirect_segments_max\fR (int) +.ad +.RS 12n +When a split block which is part of a indirect vdev contains more than this +many segments, consider it too computationally expensive to check all possible +combinations. Instead, operate under the assumption that only a few segment +copies are damaged and the majority of segment copies are good, in which case +it is reasonable to randomly select sample combinations. This allows all the +segment copies to participate fairly in the reconstruction and prevents the +repeated use of one bad copy. +.sp +Default value: \fB10\fR. +.RE + .sp .ne 2 .na diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 53953a6c5..b87b4d555 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2959,6 +2959,19 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, { vdev_t *vd; + vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + if (vd->vdev_ops == &vdev_indirect_ops) { + /* + * The indirect vdev can point to multiple + * vdevs. For simplicity, always create + * the resilver zio_t. zio_vdev_io_start() + * will bypass the child resilver i/o's if + * they are on vdevs that don't have DTL's. + */ + return (B_TRUE); + } + if (DVA_GET_GANG(dva)) { /* * Gang members may be spread across multiple @@ -2971,8 +2984,6 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, return (B_TRUE); } - vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); - /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 25090f089..52a60cc5e 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -3234,7 +3234,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, return; if (spa->spa_vdev_removal != NULL && - spa->spa_vdev_removal->svr_vdev == vd && + spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 08fc7bbda..9a5346b42 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -4860,8 +4860,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && - tvd->vdev_ashift != - spa->spa_vdev_removal->svr_vdev->vdev_ashift) { + tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ @@ -4970,10 +4969,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); - if (spa->spa_vdev_removal != NULL || - spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 5c6e82567..54299a7e4 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1708,9 +1708,12 @@ spa_update_dspace(spa_t *spa) * allocated twice (on the old device and the new * device). */ - vdev_t *vd = spa->spa_vdev_removal->svr_vdev; + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vdev_t *vd = + vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); spa->spa_dspace -= spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + spa_config_exit(spa, SCL_VDEV, FTAG); } } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index f3e3f90fa..0dea0b4e4 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -857,6 +857,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; + /* + * State which may be set on a top-level vdev that's in the + * process of being removed. + */ + ASSERT0(tvd->vdev_indirect_config.vic_births_object); + ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); + ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); + ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); + ASSERT3P(tvd->vdev_indirect_births, ==, NULL); + ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0(tvd->vdev_removing); + tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_indirect_config = svd->vdev_indirect_config; + tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; + tvd->vdev_indirect_births = svd->vdev_indirect_births; + range_tree_swap(&svd->vdev_obsolete_segments, + &tvd->vdev_obsolete_segments); + tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; + svd->vdev_indirect_config.vic_mapping_object = 0; + svd->vdev_indirect_config.vic_births_object = 0; + svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; + svd->vdev_indirect_mapping = NULL; + svd->vdev_indirect_births = NULL; + svd->vdev_obsolete_sm = NULL; + svd->vdev_removing = 0; + for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 86a05daa8..3ccdfee3b 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -44,10 +45,11 @@ * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * - * - reads and repair writes to this device use the callback to create - * a child io for each mapped segment. + * - i/os to this vdev use the callback to determine where the + * data is now located, and issue child i/os for each segment's new + * location. * - * - frees and claims to this device use the callback to free or claim + * - frees and claims to this vdev use the callback to free or claim * each mapped segment. (Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak @@ -201,6 +203,95 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; */ int zfs_condense_indirect_commit_entry_delay_ms = 0; +/* + * If a split block contains more than this many segments, consider it too + * computationally expensive to check all (2^num_segments) possible + * combinations. Instead, try at most 2^_segments_max randomly-selected + * combinations. + * + * This is reasonable if only a few segment copies are damaged and the + * majority of segment copies are good. It allows all segment copies to + * participate fairly in the reconstruction and prevents repeated use of + * one bad copy. + */ +int zfs_reconstruct_indirect_segments_max = 10; + +/* + * The indirect_child_t represents the vdev that we will read from, when we + * need to read all copies of the data (e.g. for scrub or reconstruction). + * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), + * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, + * ic_vdev is a child of the mirror. + */ +typedef struct indirect_child { + abd_t *ic_data; + vdev_t *ic_vdev; +} indirect_child_t; + +/* + * The indirect_split_t represents one mapped segment of an i/o to the + * indirect vdev. For non-split (contiguously-mapped) blocks, there will be + * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. + * For split blocks, there will be several of these. + */ +typedef struct indirect_split { + list_node_t is_node; /* link on iv_splits */ + + /* + * is_split_offset is the offset into the i/o. + * This is the sum of the previous splits' is_size's. + */ + uint64_t is_split_offset; + + vdev_t *is_vdev; /* top-level vdev */ + uint64_t is_target_offset; /* offset on is_vdev */ + uint64_t is_size; + int is_children; /* number of entries in is_child[] */ + + /* + * is_good_child is the child that we are currently using to + * attempt reconstruction. + */ + int is_good_child; + + indirect_child_t is_child[1]; /* variable-length */ +} indirect_split_t; + +/* + * The indirect_vsd_t is associated with each i/o to the indirect vdev. + * It is the "Vdev-Specific Data" in the zio_t's io_vsd. + */ +typedef struct indirect_vsd { + boolean_t iv_split_block; + boolean_t iv_reconstruct; + + list_t iv_splits; /* list of indirect_split_t's */ +} indirect_vsd_t; + +static void +vdev_indirect_map_free(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + indirect_split_t *is; + while ((is = list_head(&iv->iv_splits)) != NULL) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic->ic_data != NULL) + abd_free(ic->ic_data); + } + list_remove(&iv->iv_splits, is); + kmem_free(is, + offsetof(indirect_split_t, is_child[is->is_children])); + } + kmem_free(iv, sizeof (*iv)); +} + +static const zio_vsd_ops_t vdev_indirect_vsd_ops = { + vdev_indirect_map_free, + zio_vsd_default_cksum_report +}; + /* * Mark the given offset and size as being obsolete in the given txg. */ @@ -813,12 +904,6 @@ vdev_indirect_close(vdev_t *vd) { } -/* ARGSUSED */ -static void -vdev_indirect_io_done(zio_t *zio) -{ -} - /* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, @@ -990,41 +1075,471 @@ vdev_indirect_child_io_done(zio_t *zio) abd_put(zio->io_abd); } +/* + * This is a callback for vdev_indirect_remap() which allocates an + * indirect_split_t for each split segment and adds it to iv_splits. + */ static void -vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset, +vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; + indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; - zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, - abd_get_offset(zio->io_abd, split_offset), - size, zio->io_type, zio->io_priority, - 0, vdev_indirect_child_io_done, zio)); + int n = 1; + if (vd->vdev_ops == &vdev_mirror_ops) + n = vd->vdev_children; + + indirect_split_t *is = + kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); + + is->is_children = n; + is->is_size = size; + is->is_split_offset = split_offset; + is->is_target_offset = offset; + is->is_vdev = vd; + + /* + * Note that we only consider multiple copies of the data for + * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even + * though they use the same ops as mirror, because there's only one + * "good" copy under the replacing/spare. + */ + if (vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < n; i++) { + is->is_child[i].ic_vdev = vd->vdev_child[i]; + } + } else { + is->is_child[0].ic_vdev = vd; + } + + list_insert_tail(&iv->iv_splits, is); +} + +static void +vdev_indirect_read_split_done(zio_t *zio) +{ + indirect_child_t *ic = zio->io_private; + + if (zio->io_error != 0) { + /* + * Clear ic_data to indicate that we do not have data for this + * child. + */ + abd_free(ic->ic_data); + ic->ic_data = NULL; + } +} + +/* + * Issue reads for all copies (mirror children) of all splits. + */ +static void +vdev_indirect_read_all(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int i = 0; i < is->is_children; i++) { + indirect_child_t *ic = &is->is_child[i]; + + if (!vdev_readable(ic->ic_vdev)) + continue; + + /* + * Note, we may read from a child whose DTL + * indicates that the data may not be present here. + * While this might result in a few i/os that will + * likely return incorrect data, it simplifies the + * code since we can treat scrub and resilver + * identically. (The incorrect data will be + * detected and ignored when we verify the + * checksum.) + */ + + ic->ic_data = abd_alloc_sametype(zio->io_abd, + is->is_size); + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, ic->ic_data, + is->is_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_read_split_done, ic)); + } + } + iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { ASSERTV(spa_t *spa = zio->io_spa); + indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); + list_create(&iv->iv_splits, + sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); + + zio->io_vsd = iv; + zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - ASSERT((zio->io_flags & - (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0); + /* + * Note: this code can handle other kinds of writes, + * but we don't expect them. + */ + ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, - vdev_indirect_io_start_cb, zio); + vdev_indirect_gather_splits, zio); + + indirect_split_t *first = list_head(&iv->iv_splits); + if (first->is_size == zio->io_size) { + /* + * This is not a split block; we are pointing to the entire + * data, which will checksum the same as the original data. + * Pass the BP down so that the child i/o can verify the + * checksum, and try a different location if available + * (e.g. on a mirror). + * + * While this special case could be handled the same as the + * general (split block) case, doing it this way ensures + * that the vast majority of blocks on indirect vdevs + * (which are not split) are handled identically to blocks + * on non-indirect vdevs. This allows us to be less strict + * about performance in the general (but rare) case. + */ + ASSERT0(first->is_split_offset); + ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + first->is_vdev, first->is_target_offset, + abd_get_offset(zio->io_abd, 0), + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } else { + iv->iv_split_block = B_TRUE; + if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { + /* + * Read all copies. Note that for simplicity, + * we don't bother consulting the DTL in the + * resilver case. + */ + vdev_indirect_read_all(zio); + } else { + /* + * Read one copy of each split segment, from the + * top-level vdev. Since we don't know the + * checksum of each split individually, the child + * zio can't ensure that we get the right data. + * E.g. if it's a mirror, it will just read from a + * random (healthy) leaf vdev. We have to verify + * the checksum in vdev_indirect_io_done(). + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + zio_nowait(zio_vdev_child_io(zio, NULL, + is->is_vdev, is->is_target_offset, + abd_get_offset(zio->io_abd, + is->is_split_offset), is->is_size, + zio->io_type, zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } + + } + } zio_execute(zio); } +/* + * Report a checksum error for a child. + */ +static void +vdev_indirect_checksum_error(zio_t *zio, + indirect_split_t *is, indirect_child_t *ic) +{ + vdev_t *vd = ic->ic_vdev; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zio_bad_cksum_t zbc = {{{ 0 }}}; + abd_t *bad_abd = ic->ic_data; + abd_t *good_abd = is->is_child[is->is_good_child].ic_data; + zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, + is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); +} + +/* + * Issue repair i/os for any incorrect copies. We do this by comparing + * each split segment's correct data (is_good_child's ic_data) with each + * other copy of the data. If they differ, then we overwrite the bad data + * with the good copy. Note that we do this without regard for the DTL's, + * which simplifies this code and also issues the optimal number of writes + * (based on which copies actually read bad data, as opposed to which we + * think might be wrong). For the same reason, we always use + * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). + */ +static void +vdev_indirect_repair(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + enum zio_flag flags = ZIO_FLAG_IO_REPAIR; + + if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) + flags |= ZIO_FLAG_SELF_HEAL; + + if (!spa_writeable(zio->io_spa)) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + indirect_child_t *good_child = &is->is_child[is->is_good_child]; + + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic == good_child) + continue; + if (ic->ic_data == NULL) + continue; + if (abd_cmp(good_child->ic_data, ic->ic_data) == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, + good_child->ic_data, is->is_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL)); + + vdev_indirect_checksum_error(zio, is, ic); + } + } +} + +/* + * Report checksum errors on all children that we read from. + */ +static void +vdev_indirect_all_checksum_errors(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + + if (ic->ic_data == NULL) + continue; + + vdev_t *vd = ic->ic_vdev; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, + is->is_target_offset, is->is_size, + NULL, NULL, NULL); + } + } +} + +/* + * This function is called when we have read all copies of the data and need + * to try to find a combination of copies that gives us the right checksum. + * + * If we pointed to any mirror vdevs, this effectively does the job of the + * mirror. The mirror vdev code can't do its own job because we don't know + * the checksum of each split segment individually. We have to try every + * combination of copies of split segments, until we find one that checksums + * correctly. (Or until we have tried all combinations, or have tried + * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we + * set io_error to ECKSUM to propagate the error up to the user.) + * + * For example, if we have 3 segments in the split, + * and each points to a 2-way mirror, we will have the following pieces of + * data: + * + * | mirror child + * split | [0] [1] + * ======|===================== + * A | data_A_0 data_A_1 + * B | data_B_0 data_B_1 + * C | data_C_0 data_C_1 + * + * We will try the following (mirror children)^(number of splits) (2^3=8) + * combinations, which is similar to bitwise-little-endian counting in + * binary. In general each "digit" corresponds to a split segment, and the + * base of each digit is is_children, which can be different for each + * digit. + * + * "low bit" "high bit" + * v v + * data_A_0 data_B_0 data_C_0 + * data_A_1 data_B_0 data_C_0 + * data_A_0 data_B_1 data_C_0 + * data_A_1 data_B_1 data_C_0 + * data_A_0 data_B_0 data_C_1 + * data_A_1 data_B_0 data_C_1 + * data_A_0 data_B_1 data_C_1 + * data_A_1 data_B_1 data_C_1 + * + * Note that the split segments may be on the same or different top-level + * vdevs. In either case, we try lots of combinations (see + * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has + * small silent errors on all of its children, we can still reconstruct the + * correct data, as long as those errors are at sufficiently-separated + * offsets (specifically, separated by the largest block size - default of + * 128KB, but up to 16MB). + */ +static void +vdev_indirect_reconstruct_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + uint64_t attempts = 0; + uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max; + int segments = 0; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) + segments++; + + for (;;) { + /* copy data from splits to main zio */ + int ret; + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + + /* + * If this child failed, its ic_data will be NULL. + * Skip this combination. + */ + if (is->is_child[is->is_good_child].ic_data == NULL) { + ret = EIO; + goto next; + } + + abd_copy_off(zio->io_abd, + is->is_child[is->is_good_child].ic_data, + is->is_split_offset, 0, is->is_size); + } + + /* See if this checksum matches. */ + zio_bad_cksum_t zbc; + ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + /* Found a matching checksum. Issue repair i/os. */ + vdev_indirect_repair(zio); + zio_checksum_verified(zio); + return; + } + + /* + * Checksum failed; try a different combination of split + * children. + */ + boolean_t more; +next: + more = B_FALSE; + if (segments <= zfs_reconstruct_indirect_segments_max) { + /* + * There are relatively few segments, so + * deterministically check all combinations. We do + * this by by adding one to the first split's + * good_child. If it overflows, then "carry over" to + * the next split (like counting in base is_children, + * but each digit can have a different base). + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child++; + if (is->is_good_child < is->is_children) { + more = B_TRUE; + break; + } + is->is_good_child = 0; + } + } else if (++attempts < attempts_max) { + /* + * There are too many combinations to try all of them + * in a reasonable amount of time, so try a fixed + * number of random combinations, after which we'll + * consider the block unrecoverable. + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child = + spa_get_random(is->is_children); + } + more = B_TRUE; + } + if (!more) { + /* All combinations failed. */ + zio->io_error = ret; + vdev_indirect_all_checksum_errors(zio); + zio_checksum_verified(zio); + return; + } + } +} + +static void +vdev_indirect_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (iv->iv_reconstruct) { + /* + * We have read all copies of the data (e.g. from mirrors), + * either because this was a scrub/resilver, or because the + * one-copy read didn't checksum correctly. + */ + vdev_indirect_reconstruct_io_done(zio); + return; + } + + if (!iv->iv_split_block) { + /* + * This was not a split block, so we passed the BP down, + * and the checksum was handled by the (one) child zio. + */ + return; + } + + zio_bad_cksum_t zbc; + int ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + zio_checksum_verified(zio); + return; + } + + /* + * The checksum didn't match. Read all copies of all splits, and + * then we will try to reconstruct. The next time + * vdev_indirect_io_done() is called, iv_reconstruct will be set. + */ + vdev_indirect_read_all(zio); + + zio_vdev_io_redone(zio); +} + vdev_ops_t vdev_indirect_ops = { vdev_indirect_open, vdev_indirect_close, @@ -1061,4 +1576,8 @@ MODULE_PARM_DESC(zfs_condense_min_mapping_bytes, module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644); MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms, "Delay while condensing vdev mapping"); + +module_param(zfs_reconstruct_indirect_segments_max, int, 0644); +MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max, + "Maximum number of split segments check all combinations"); #endif diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index b56f955eb..4b01f317b 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -486,12 +486,15 @@ vdev_mirror_io_start(zio_t *zio) mm = vdev_mirror_map_init(zio); if (zio->io_type == ZIO_TYPE_READ) { - if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { + if (zio->io_bp != NULL && + (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { /* - * For scrubbing reads we need to allocate a read - * buffer for each child and issue reads to all - * children. If any child succeeds, it will copy its - * data into zio->io_data in vdev_mirror_scrub_done. + * For scrubbing reads (if we can verify the + * checksum here, as indicated by io_bp being + * non-NULL) we need to allocate a read buffer for + * each child and issue reads to all children. If + * any child succeeds, it will copy its data into + * zio->io_data in vdev_mirror_scrub_done. */ for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -640,7 +643,21 @@ vdev_mirror_io_done(zio_t *zio) if (mc->mc_error == 0) { if (mc->mc_tried) continue; + /* + * We didn't try this child. We need to + * repair it if: + * 1. it's a scrub (in which case we have + * tried everything that was healthy) + * - or - + * 2. it's an indirect vdev (in which case + * it could point to any other vdev, which + * might have a bad DTL) + * - or - + * 3. the DTL indicates that this data is + * missing from this vdev + */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && + mc->mc_vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 6e81bf014..0fca8fb03 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -84,18 +84,12 @@ typedef struct vdev_copy_arg { kmutex_t vca_lock; } vdev_copy_arg_t; -typedef struct vdev_copy_seg_arg { - vdev_copy_arg_t *vcsa_copy_arg; - uint64_t vcsa_txg; - dva_t *vcsa_dest_dva; - blkptr_t *vcsa_dest_bp; -} vdev_copy_seg_arg_t; - /* - * The maximum amount of allowed data we're allowed to copy from a device - * at a time when removing it. + * The maximum amount of memory we can use for outstanding i/o while + * doing a device removal. This determines how much i/o we can have + * in flight concurrently. */ -int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; +int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when @@ -165,7 +159,7 @@ spa_vdev_removal_create(vdev_t *vd) mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); svr->svr_allocd_segs = range_tree_create(NULL, NULL); - svr->svr_vdev = vd; + svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { svr->svr_frees[i] = range_tree_create(NULL, NULL); @@ -207,9 +201,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr) static void vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) { - vdev_t *vd = arg; + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; spa_vdev_removal_t *svr = NULL; ASSERTV(uint64_t txg = dmu_tx_get_txg(tx)); @@ -331,7 +326,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) ASSERT3P(spa->spa_vdev_removal, ==, NULL); spa->spa_vdev_removal = svr; svr->svr_thread = thread_create(NULL, 0, - spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri); + spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* @@ -372,21 +367,24 @@ spa_remove_init(spa_t *spa) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_removing_phys.sr_removing_vdev); - spa_config_exit(spa, SCL_STATE, FTAG); - if (vd == NULL) + if (vd == NULL) { + spa_config_exit(spa, SCL_STATE, FTAG); return (EINVAL); + } vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT(vdev_is_concrete(vd)); spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); - ASSERT(svr->svr_vdev->vdev_removing); + ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); + ASSERT(vd->vdev_removing); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); + spa_config_exit(spa, SCL_STATE, FTAG); spa->spa_vdev_removal = svr; } @@ -439,15 +437,8 @@ spa_restart_removal(spa_t *spa) if (!spa_writeable(spa)) return; - vdev_t *vd = svr->svr_vdev; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - ASSERT3P(vd, !=, NULL); - ASSERT(vd->vdev_removing); - - zfs_dbgmsg("restarting removal of %llu at count=%llu", - vd->vdev_id, vdev_indirect_mapping_num_entries(vim)); - svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd, + zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); + svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } @@ -468,7 +459,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); - ASSERT3P(vd, ==, svr->svr_vdev); + ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); ASSERT3U(spa_syncing_txg(spa), ==, txg); mutex_enter(&svr->svr_lock); @@ -646,7 +637,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) if (state == DSS_FINISHED) { spa_removing_phys_t *srp = &spa->spa_removing_phys; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (srp->sr_prev_indirect_vdev != UINT64_MAX) { @@ -690,7 +681,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config); uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; @@ -718,64 +709,128 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx) spa_sync_removing_state(spa, tx); } +/* + * All reads and writes associated with a call to spa_vdev_copy_segment() + * are done. + */ +static void +spa_vdev_copy_nullzio_done(zio_t *zio) +{ + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The write of the new location is done. + */ static void spa_vdev_copy_segment_write_done(zio_t *zio) { - vdev_copy_seg_arg_t *vcsa = zio->io_private; - vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg; - spa_config_exit(zio->io_spa, SCL_STATE, FTAG); + vdev_copy_arg_t *vca = zio->io_private; + abd_free(zio->io_abd); mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes -= zio->io_size; cv_signal(&vca->vca_cv); mutex_exit(&vca->vca_lock); - - ASSERT0(zio->io_error); - kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t)); - kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t)); } +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ static void spa_vdev_copy_segment_read_done(zio_t *zio) { - vdev_copy_seg_arg_t *vcsa = zio->io_private; - dva_t *dest_dva = vcsa->vcsa_dest_dva; - uint64_t txg = vcsa->vcsa_txg; - spa_t *spa = zio->io_spa; - ASSERTV(vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva))); - blkptr_t *bp = NULL; - dva_t *dva = NULL; - uint64_t size = zio->io_size; - - ASSERT3P(dest_vd, !=, NULL); - ASSERT0(zio->io_error); - - vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - bp = vcsa->vcsa_dest_bp; - dva = bp->blk_dva; - - BP_ZERO(bp); - - /* initialize with dest_dva */ - bcopy(dest_dva, dva, sizeof (dva_t)); - BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(bp, DMU_OT_NONE); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa, - txg, bp, zio->io_abd, size, - spa_vdev_copy_segment_write_done, vcsa, - ZIO_PRIORITY_REMOVAL, 0, NULL)); + zio_nowait(zio_unique_parent(zio)); } +/* + * If the old and new vdevs are mirrors, we will read both sides of the old + * mirror, and write each copy to the corresponding side of the new mirror. + * If the old and new vdevs have a different number of children, we will do + * this as best as possible. Since we aren't verifying checksums, this + * ensures that as long as there's a good copy of the data, we'll have a + * good copy after the removal, even if there's silent damage to one side + * of the mirror. If we're removing a mirror that has some silent damage, + * we'll have exactly the same damage in the new location (assuming that + * the new location is also a mirror). + * + * We accomplish this by creating a tree of zio_t's, with as many writes as + * there are "children" of the new vdev (a non-redundant vdev counts as one + * child, a 2-way mirror has 2 children, etc). Each write has an associated + * read from a child of the old vdev. Typically there will be the same + * number of children of the old and new vdevs. However, if there are more + * children of the new vdev, some child(ren) of the old vdev will be issued + * multiple reads. If there are more children of the old vdev, some copies + * will be dropped. + * + * For example, the tree of zio_t's for a 2-way mirror is: + * + * null + * / \ + * write(new vdev, child 0) write(new vdev, child 1) + * | | + * read(old vdev, child 0) read(old vdev, child 1) + * + * Child zio's complete before their parents complete. However, zio's + * created with zio_vdev_child_io() may be issued before their children + * complete. In this case we need to make sure that the children (reads) + * complete before the parents (writes) are *issued*. We do this by not + * calling zio_nowait() on each write until its corresponding read has + * completed. + * + * The spa_config_lock must be held while zio's created by + * zio_vdev_child_io() are in progress, to ensure that the vdev tree does + * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" + * zio is needed to release the spa_config_lock after all the reads and + * writes complete. (Note that we can't grab the config lock for each read, + * because it is not reentrant - we could deadlock with a thread waiting + * for a write lock.) + */ +static void +spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, + vdev_t *source_vd, uint64_t source_offset, + vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) +{ + ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes += size; + mutex_exit(&vca->vca_lock); + + abd_t *abd = abd_alloc_for_io(size, B_FALSE); + + vdev_t *source_child_vd; + if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { + /* + * Source and dest are both mirrors. Copy from the same + * child id as we are copying to (wrapping around if there + * are more dest children than source children). + */ + source_child_vd = + source_vd->vdev_child[dest_id % source_vd->vdev_children]; + } else { + source_child_vd = source_vd; + } + + zio_t *write_zio = zio_vdev_child_io(nzio, NULL, + dest_child_vd, dest_offset, abd, size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_write_done, vca); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + source_child_vd, source_offset, abd, size, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_read_done, vca)); +} + +/* + * Allocate a new location for this segment, and create the zio_t's to + * read from the old location and write to the new location. + */ static int spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, vdev_copy_arg_t *vca, zio_alloc_list_t *zal) @@ -784,10 +839,7 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_entry_t *entry; - vdev_copy_seg_arg_t *private; dva_t dst = {{ 0 }}; - blkptr_t blk, *bp = &blk; - dva_t *dva = bp->blk_dva; ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); @@ -804,51 +856,28 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, */ ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); - mutex_enter(&vca->vca_lock); - vca->vca_outstanding_bytes += size; - mutex_exit(&vca->vca_lock); - entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); entry->vime_mapping.vimep_dst = dst; - private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP); - private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; - private->vcsa_txg = txg; - private->vcsa_copy_arg = vca; - /* - * This lock is eventually released by the donefunc for the - * zio_write_phys that finishes copying the data. + * See comment before spa_vdev_copy_one_child(). */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - /* - * Do logical I/O, letting the redundancy vdevs (like mirror) - * handle their own I/O instead of duplicating that code here. - */ - BP_ZERO(bp); - - DVA_SET_VDEV(&dva[0], vd->vdev_id); - DVA_SET_OFFSET(&dva[0], start); - DVA_SET_GANG(&dva[0], 0); - DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size)); - - BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(bp, DMU_OT_NONE); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, - bp, abd_alloc_for_io(size, B_FALSE), size, - spa_vdev_copy_segment_read_done, private, - ZIO_PRIORITY_REMOVAL, 0, NULL)); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, + spa_vdev_copy_nullzio_done, NULL, 0); + vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); + if (dest_vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < dest_vd->vdev_children; i++) { + vdev_t *child = dest_vd->vdev_child[i]; + spa_vdev_copy_one_child(vca, nzio, vd, start, + child, DVA_GET_OFFSET(&dst), i, size); + } + } else { + spa_vdev_copy_one_child(vca, nzio, vd, start, + dest_vd, DVA_GET_OFFSET(&dst), -1, size); + } + zio_nowait(nzio); list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); @@ -866,8 +895,8 @@ static void vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; - vdev_t *vd = svr->svr_vdev; - spa_t *spa = vd->vdev_spa; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); @@ -895,37 +924,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) "%s vdev %llu", spa_name(spa), vd->vdev_id); } -static void -vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd) -{ - ivd->vdev_indirect_config = vd->vdev_indirect_config; - - ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL); - ASSERT(vd->vdev_indirect_mapping != NULL); - ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping; - vd->vdev_indirect_mapping = NULL; - - ASSERT3P(ivd->vdev_indirect_births, ==, NULL); - ASSERT(vd->vdev_indirect_births != NULL); - ivd->vdev_indirect_births = vd->vdev_indirect_births; - vd->vdev_indirect_births = NULL; - - ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); - ASSERT0(range_tree_space(ivd->vdev_obsolete_segments)); - - if (vd->vdev_obsolete_sm != NULL) { - ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize); - - /* - * We cannot use space_map_{open,close} because we hold all - * the config locks as writer. - */ - ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL); - ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm; - vd->vdev_obsolete_sm = NULL; - } -} - static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { @@ -961,17 +959,13 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) vdev_remove_enlist_zaps(vd, svr->svr_zaplist); ivd = vdev_add_parent(vd, &vdev_indirect_ops); + ivd->vdev_removing = 0; vd->vdev_leaf_zap = 0; vdev_remove_child(ivd, vd); vdev_compact_children(ivd); - vdev_indirect_state_transfer(ivd, vd); - - svr->svr_vdev = ivd; - - ASSERT(!ivd->vdev_removing); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -994,9 +988,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) * context by the removal thread after we have copied all vdev's data. */ static void -vdev_remove_complete(vdev_t *vd) +vdev_remove_complete(spa_t *spa) { - spa_t *spa = vd->vdev_spa; uint64_t txg; /* @@ -1004,8 +997,12 @@ vdev_remove_complete(vdev_t *vd) * vdev_metaslab_fini() */ txg_wait_synced(spa->spa_dsl_pool, 0); - txg = spa_vdev_enter(spa); + vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", vd->vdev_id, txg); @@ -1025,6 +1022,10 @@ vdev_remove_complete(vdev_t *vd) /* * We now release the locks, allowing spa_sync to run and finish the * removal via vdev_remove_complete_sync in syncing context. + * + * Note that we hold on to the vdev_t that has been replaced. Since + * it isn't part of the vdev tree any longer, it can't be concurrently + * manipulated, even while we don't have the config lock. */ (void) spa_vdev_exit(spa, NULL, txg, 0); @@ -1046,6 +1047,9 @@ vdev_remove_complete(vdev_t *vd) */ vdev_config_dirty(spa->spa_root_vdev); (void) spa_vdev_exit(spa, vd, txg, 0); + + if (ev != NULL) + spa_event_post(ev); } /* @@ -1056,7 +1060,7 @@ vdev_remove_complete(vdev_t *vd) * this size again this txg. */ static void -spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, +spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, uint64_t *max_alloc, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); @@ -1095,7 +1099,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, while (length > 0) { uint64_t mylen = MIN(length, thismax); - int error = spa_vdev_copy_segment(svr->svr_vdev, + int error = spa_vdev_copy_segment(vd, offset, mylen, txg, vca, &zal); if (error == ENOSPC) { @@ -1153,12 +1157,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, static void spa_vdev_remove_thread(void *arg) { - vdev_t *vd = arg; - spa_t *spa = vd->vdev_spa; + spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; uint64_t max_alloc = zfs_remove_max_segment; uint64_t last_txg = 0; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); @@ -1166,7 +1172,6 @@ spa_vdev_remove_thread(void *arg) ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_removing); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); - ASSERT3P(svr->svr_vdev, ==, vd); ASSERT(vim != NULL); mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1247,6 +1252,17 @@ spa_vdev_remove_thread(void *arg) mutex_exit(&svr->svr_lock); + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vca.vca_lock); while (vca.vca_outstanding_bytes > zfs_remove_max_copy_bytes) { @@ -1260,11 +1276,19 @@ spa_vdev_remove_thread(void *arg) VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); + /* + * Reacquire the vdev_config lock. The vdev_t + * that we're removing may have changed, e.g. due + * to a vdev_attach or vdev_detach. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vd = vdev_lookup_top(spa, svr->svr_vdev_id); + if (txg != last_txg) max_alloc = zfs_remove_max_segment; last_txg = txg; - spa_vdev_copy_impl(svr, &vca, &max_alloc, tx); + spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); dmu_tx_commit(tx); mutex_enter(&svr->svr_lock); @@ -1272,6 +1296,9 @@ spa_vdev_remove_thread(void *arg) } mutex_exit(&svr->svr_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + /* * Wait for all copies to finish before cleaning up the vca. */ @@ -1289,7 +1316,7 @@ spa_vdev_remove_thread(void *arg) mutex_exit(&svr->svr_lock); } else { ASSERT0(range_tree_space(svr->svr_allocd_segs)); - vdev_remove_complete(vd); + vdev_remove_complete(spa); } } @@ -1330,7 +1357,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; @@ -1403,8 +1430,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) * because we have not allocated mappings for it yet. */ uint64_t syncd = vdev_indirect_mapping_max_offset(vim); - range_tree_clear(svr->svr_allocd_segs, syncd, - msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd); + uint64_t sm_end = msp->ms_sm->sm_start + + msp->ms_sm->sm_size; + if (sm_end > syncd) + range_tree_clear(svr->svr_allocd_segs, + syncd, sm_end - syncd); mutex_exit(&svr->svr_lock); } @@ -1465,7 +1495,7 @@ spa_vdev_remove_cancel(spa_t *spa) if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); - uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id; + uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE); @@ -1774,7 +1804,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_initiate_sync, - vd, 0, ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); return (0); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 5259a0c6f..9bb2459bb 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1212,17 +1212,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); - /* - * In the common case, where the parent zio was to a normal vdev, - * the child zio must be to a child vdev of that vdev. Otherwise, - * the child zio must be to a top-level vdev. - */ - if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) { - ASSERT3P(vd->vdev_parent, ==, pio->io_vd); - } else { - ASSERT3P(vd, ==, vd->vdev_top); - } - if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the @@ -1283,7 +1272,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, - int type, zio_priority_t priority, enum zio_flag flags, + zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; @@ -3481,7 +3470,7 @@ zio_vdev_io_start(zio_t *zio) */ ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | - ZIO_FLAG_INDUCE_DAMAGE)); + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } align = 1ULL << vd->vdev_top->vdev_ashift; @@ -3521,18 +3510,37 @@ zio_vdev_io_start(zio_t *zio) * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. - * This prevents spurious resilvering with nested replication. - * For example, given a mirror of mirrors, (A+B)+(C+D), if only - * A is out of date, we'll read from C+D, then use the data to - * resilver A+B -- but we don't actually want to resilver B, just A. - * The top-level mirror has no way to know this, so instead we just - * discard unnecessary repairs as we work our way down the vdev tree. - * The same logic applies to any form of nested replication: - * ditto + mirror, RAID-Z + replacing, etc. This covers them all. + * This prevents spurious resilvering. + * + * There are a few ways that we can end up creating these spurious + * resilver i/os: + * + * 1. A resilver i/o will be issued if any DVA in the BP has a + * dirty DTL. The mirror code will issue resilver writes to + * each DVA, including the one(s) that are not on vdevs with dirty + * DTLs. + * + * 2. With nested replication, which happens when we have a + * "replacing" or "spare" vdev that's a child of a mirror or raidz. + * For example, given mirror(replacing(A+B), C), it's likely that + * only A is out of date (it's the new device). In this case, we'll + * read from C, then use the data to resilver A+B -- but we don't + * actually want to resilver B, just A. The top-level mirror has no + * way to know this, so instead we just discard unnecessary repairs + * as we work our way down the vdev tree. + * + * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. + * The same logic applies to any form of nested replication: ditto + * + mirror, RAID-Z + replacing, etc. + * + * However, indirect vdevs point off to other vdevs which may have + * DTL's, so we never bypass them. The child i/os on concrete vdevs + * will be properly bypassed instead. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ + vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio);