mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 18:40:43 +03:00
Resilver restarts unnecessarily when it encounters errors
When a resilver finishes, vdev_dtl_reassess is called to hopefully excise DTL_MISSING (amongst other things). If there are errors during the resilver, they are tracked in DTL_SCRUB, as spelled out in the block comment in vdev.c. DTL_SCRUB is in-core only, so it can only be used if the pool was online for the whole resilver. This state is tracked with the spa_scrub_started flag, which only gets set when the scan is initialized. Unfortunately, this flag gets cleared right before vdev_dtl_reassess gets called, so if there are any errors during the scan, DTL_MISSING will never get excised and the resilver will just continually restart. This fix simply moves clearing that flag until after the call to vdev_dtl_reassess. In addition, if a pool is imported and already has scn_errors > 0, this change will restart the resilver immediately instead of doing the rest of the scan and then restarting it from the beginning. On the other hand, if scn_errors == 0 at import, then no errors have been encountered so far, so the spa_scrub_started flag can be safely set. A test has been added to verify that resilver does not restart when relevant DTLs are available. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Paul Zuchowski <pzuchowski@datto.com> Signed-off-by: John Poduska <jpoduska@datto.com> Closes #10291
This commit is contained in:
@@ -758,7 +758,7 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
|
||||
tags = ['functional', 'reservation']
|
||||
|
||||
[tests/functional/resilver]
|
||||
tests = ['resilver_restart_001']
|
||||
tests = ['resilver_restart_001', 'resilver_restart_002']
|
||||
tags = ['functional', 'resilver']
|
||||
|
||||
[tests/functional/rootpool]
|
||||
|
||||
@@ -59,6 +59,7 @@ OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_esti
|
||||
REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress
|
||||
REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment
|
||||
RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms
|
||||
SCAN_LEGACY scan_legacy zfs_scan_legacy
|
||||
SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress
|
||||
SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit
|
||||
SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time
|
||||
|
||||
@@ -2,7 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/resilver
|
||||
dist_pkgdata_SCRIPTS = \
|
||||
setup.ksh \
|
||||
cleanup.ksh \
|
||||
resilver_restart_001.ksh
|
||||
resilver_restart_001.ksh \
|
||||
resilver_restart_002.ksh
|
||||
|
||||
dist_pkgdata_DATA = \
|
||||
resilver.cfg
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
#!/bin/ksh -p
|
||||
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2020, Datto Inc. All rights reserved.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/resilver/resilver.cfg
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Testing resilver completes when scan errors are encountered, but relevant
|
||||
# DTL's have not been lost.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a pool (1k recordsize)
|
||||
# 2. Create a 32m file (32k records)
|
||||
# 3. Inject an error halfway through the file
|
||||
# 4. Start a resilver, ensure the error is triggered and that the resilver
|
||||
# does not restart after finishing
|
||||
#
|
||||
# NB: use legacy scanning to ensure scan of specific block causes error
|
||||
#
|
||||
|
||||
#
# Exit handler (registered via log_onexit) that undoes this test's setup:
# clears all zinject handlers, destroys the test pool, removes the
# file-backed vdevs, and restores the SCAN_LEGACY tunable to the value
# saved in ORIG_SCAN_LEGACY.
#
function cleanup
{
	log_must zinject -c all
	destroy_pool $TESTPOOL
	# quote the paths so that names containing whitespace or glob
	# characters are removed as-is instead of being split or expanded
	rm -f "${VDEV_FILES[@]}" "$SPARE_VDEV_FILE"
	log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY
}
|
||||
|
||||
log_assert "Check for resilver restarts caused by scan errors"

# remember the current setting so cleanup can restore it
ORIG_SCAN_LEGACY=$(get_tunable SCAN_LEGACY)

log_onexit cleanup

# use legacy scan to ensure injected error will be triggered
log_must set_tunable32 SCAN_LEGACY 1

# create the pool and a 32M file (32k blocks)
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE
log_must zpool create -f -O recordsize=1k $TESTPOOL ${VDEV_FILES[0]}
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=32 > /dev/null 2>&1

# determine objset/object
objset=$(zdb -d $TESTPOOL/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p')
object=$(ls -i /$TESTPOOL/file | awk '{print $1}')

# inject event to cause error during resilver
# (level 0, block 0x3fff: halfway through the 32k-block file)
log_must zinject -b $(printf "%x:%x:0:3fff" $objset $object) $TESTPOOL

# clear events and start resilver
log_must zpool events -c
log_must zpool attach $TESTPOOL ${VDEV_FILES[0]} $SPARE_VDEV_FILE

log_note "waiting for read errors to start showing up"
err=0
for iter in {0..59}
do
	zpool sync $TESTPOOL
	# third column of the vdev's status line is its read error count
	err=$(zpool status $TESTPOOL | grep ${VDEV_FILES[0]} | awk '{print $3}')
	# an empty result (vdev line not found) must count as "no errors
	# yet" rather than producing an arithmetic syntax error
	(( ${err:-0} > 0 )) && break
	sleep 1
done

(( ${err:-0} == 0 )) && log_fail "Unable to induce errors in resilver"

log_note "waiting for resilver to finish"
finish=0
for iter in {0..59}
do
	finish=$(zpool events | grep -c "sysevent.fs.zfs.resilver_finish")
	(( ${finish:-0} > 0 )) && break
	sleep 1
done

(( ${finish:-0} == 0 )) && log_fail "resilver took too long to finish"

# wait a few syncs to ensure that zfs does not restart the resilver
log_must zpool sync $TESTPOOL
log_must zpool sync $TESTPOOL

# check if resilver was restarted
# (events were cleared before the attach, so exactly one start is expected)
start=$(zpool events | grep -c "sysevent.fs.zfs.resilver_start")
(( ${start:-0} != 1 )) && log_fail "resilver restarted unnecessarily"

log_pass "Resilver did not restart unnecessarily from scan errors"
|
||||
Reference in New Issue
Block a user