diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h index b787b1d5d..8ecb30659 100644 --- a/include/sys/vdev_rebuild.h +++ b/include/sys/vdev_rebuild.h @@ -70,6 +70,7 @@ typedef struct vdev_rebuild { zfs_range_tree_t *vr_scan_tree; kmutex_t vr_io_lock; /* inflight IO lock */ kcondvar_t vr_io_cv; /* inflight IO cv */ + uint64_t vr_last_txg; /* last used txg */ /* In-core state and progress */ uint64_t vr_scan_offset[TXG_SIZE]; diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 36b3f9e66..384421206 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -593,6 +593,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); uint64_t txg = dmu_tx_get_txg(tx); + vr->vr_last_txg = txg; spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); mutex_enter(&vd->vdev_rebuild_lock); @@ -908,8 +909,14 @@ vdev_rebuild_thread(void *arg) error = vdev_rebuild_ranges(vr); zfs_range_tree_vacate(vr->vr_scan_tree, NULL, NULL); - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + /* + * Wait for rebuilt ranges to be synced before re-enabling the + * metaslab, to avoid any interfering allocations. Otherwise, a + * subsequent scrub might report checksum errors. + */ + txg_wait_synced(dp, vr->vr_last_txg); metaslab_enable(msp, B_FALSE, B_FALSE); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (error != 0) break; diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib index 702268aee..b9b69e47d 100644 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -360,12 +360,7 @@ function recover_bad_missing_devs # expected state after a healing resilver of a healthy pool. # # 2. sequential - The pool is fully intact. 
There should never be a -# checksum error, but the occasional checksum error does occur in -# practice. Until the root cause is identified and resolved, tolerate -# a checksum error when scrubbing after a sequential resilver. -# -# https://github.com/openzfs/zfs/issues/18307 -# https://github.com/openzfs/zfs/issues/18319 +# checksum error. # # 3. damaged - The pool was intentionally silently damaged. Checksum # errors are expected to be reported as the damaged blocks are @@ -395,7 +390,7 @@ function verify_draid_pool log_fail "Unexpected repair IO found for $pool ($cksum)" fi elif [[ "$replace_mode" = "sequential" ]]; then - if [[ $cksum -gt 3 ]]; then + if [[ $cksum -gt 0 ]]; then log_must zpool status -v $pool log_fail "Unexpected CKSUM errors found for $pool ($cksum)" fi