Fix double spares for failed vdev

It's possible for two spares to get attached to a single failed vdev. This happens when you have a failed disk that is spared, and then you replace the failed disk with a new disk, but during the resilver the new disk fails, and ZED kicks in a spare for the failed new disk. This commit checks for that condition and disallows it. Reviewed-by: Akash B <akash-b@hpe.com> Reviewed-by: Ameer Hamza <ahamza@ixsystems.com> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes: #16547 Closes: #17231 (cherry picked from commit f40ab9e399)
2026-05-22 10:37:35 +03:00 · 2025-05-02 09:03:11 -07:00
parent cd777ba5ad
commit 4b014840ea
4 changed files with 209 additions and 4 deletions
@@ -123,10 +123,10 @@ tags = ['functional', 'fallocate']
 [tests/functional/fault:Linux]
 tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
    'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
-    'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
-    'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
-    'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors',
-    'suspend_resume_single', 'zpool_status_-s']
+    'auto_spare_002_pos', 'auto_spare_double', 'auto_spare_multiple',
+    'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault',
+    'decompress_fault', 'fault_limits', 'scrub_after_resilver',
+    'suspend_on_probe_errors', 'suspend_resume_single', 'zpool_status_-s']
 tags = ['functional', 'fault']

 [tests/functional/features/large_dnode:Linux]
@@ -1532,6 +1532,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/fault/auto_spare_001_pos.ksh \
 	functional/fault/auto_spare_002_pos.ksh \
 	functional/fault/auto_spare_ashift.ksh \
+	functional/fault/auto_spare_double.ksh \
 	functional/fault/auto_spare_multiple.ksh \
 	functional/fault/auto_spare_shared.ksh \
 	functional/fault/cleanup.ksh \
@@ -0,0 +1,122 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 by Lawrence Livermore National Security, LLC.
+#
+
+# DESCRIPTION:
+# Try to induce a double spare condition and verify we're prevented from doing
+# it.
+#
+# STRATEGY:
+# 1. Fail a drive
+# 2. Kick in first spare
+# 3. Bring new drive back online and start resilvering to it
+# 4. Immediately after the resilver starts, fail the new drive.
+# 5. Try to kick in the second spare for the new drive (which is now failed)
+# 6. Verify that we can't kick in the second spare
+#
+# Repeat this test for both traditional spares and dRAID spares.
+#
+. $STF_SUITE/include/libtest.shlib
+
+# Backing files are named file0 .. file$LAST_VDEV under $TEST_BASE_DIR.
+LAST_VDEV=3
+# Size of each backing file; passed to "truncate -s" with an "M" suffix.
+SIZE_MB=300
+
+# Output of zed_check, captured once up front.  A non-empty value is treated
+# below as "ZED was running when the test started".
+ZED_PID=$(zed_check)
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+	for i in {0..$LAST_VDEV} ; do
+		log_must rm -f $TEST_BASE_DIR/file$i
+	done
+
+	# Restore ZED if it was running before this test
+	if [ -n $ZED_PID ] ; then
+		log_must zed_start
+	fi
+}
+
+log_assert "Cannot attach two spares to same failed vdev"
+log_onexit cleanup
+
+# Stop ZED if it's running, so that spares are only attached by the explicit
+# "zpool replace" calls in this test.  $ZED_PID is quoted because an empty
+# unquoted value turns the check into `[ -n ]`, which is always true.
+if [ -n "$ZED_PID" ] ; then
+	log_must zed_stop
+fi
+
+# Create the sparse backing files file0 .. file$LAST_VDEV.
+log_must truncate -s ${SIZE_MB}M $TEST_BASE_DIR/file{0..$LAST_VDEV}
+
+# Kernel debug message log that do_test clears and later greps.
+ZFS_DBGMSG=/proc/spl/kstat/zfs/dbgmsg
+
+# Run the test - we assume the pool is already created.
+#
+# Attaches the first spare to a faulted disk, starts replacing the faulted
+# disk with a blank copy of itself, faults it again, and then verifies that
+# attaching the second spare is rejected.
+#
+# $1: disk to fail
+# $2: 1st spare name
+# $3: 2nd spare name
+#
+# Sets the globals FAIL_DRIVE, SPARE0 and SPARE1.
+function do_test {
+	FAIL_DRIVE=$1
+	SPARE0=$2
+	SPARE1=$3
+
+	# Reset the debug log so the grep at the end only sees this run.
+	# NOTE(review): assumes writing 0 clears dbgmsg - confirm.
+	echo 0 > "$ZFS_DBGMSG"
+	log_must zpool status
+
+	log_note "Kicking in first spare ($SPARE0)"
+	log_must zpool offline -f $TESTPOOL "$FAIL_DRIVE"
+	log_must zpool replace $TESTPOOL "$FAIL_DRIVE" "$SPARE0"
+
+	# Fill the pool with data to make the resilver take a little
+	# time.  dd is expected to stop with an error once the pool is
+	# full, so its exit status is deliberately ignored.
+	dd if=/dev/zero of=/$TESTPOOL/testfile bs=1M || true
+
+	# Zero our failed disk.  It will appear as a blank.
+	log_must rm -f "$FAIL_DRIVE"
+	log_must truncate -s ${SIZE_MB}M "$FAIL_DRIVE"
+
+	# Attempt to replace our failed disk, then immediately fault it.
+	log_must zpool replace $TESTPOOL "$FAIL_DRIVE"
+	log_must zpool offline -f $TESTPOOL "$FAIL_DRIVE"
+	log_must check_state $TESTPOOL "$FAIL_DRIVE" "faulted"
+
+	log_note "Kicking in second spare ($SPARE1)... This should not work..."
+	log_mustnot zpool replace $TESTPOOL "$FAIL_DRIVE" "$SPARE1"
+	# Verify the debug line in dbgmsg
+	log_must grep 'disk would create double spares' "$ZFS_DBGMSG"
+
+	# Disk should still be faulted
+	log_must check_state $TESTPOOL "$FAIL_DRIVE" "faulted"
+}
+
+# Test with traditional spares
+log_must zpool create -O compression=off -O recordsize=4k -O primarycache=none \
+	$TESTPOOL mirror $TEST_BASE_DIR/file{0,1} spare $TEST_BASE_DIR/file{2,3}
+do_test $TEST_BASE_DIR/file1 $TEST_BASE_DIR/file2 $TEST_BASE_DIR/file3
+destroy_pool $TESTPOOL
+
+# Clear vdev files for next test
+for i in {0..$LAST_VDEV} ; do
+	log_must rm -f $TEST_BASE_DIR/file$i
+done
+log_must truncate -s ${SIZE_MB}M $TEST_BASE_DIR/file{0..$LAST_VDEV}
+
+# Test with dRAID spares
+log_must zpool create -O compression=off -O recordsize=4k -O primarycache=none \
+	$TESTPOOL draid1:1d:4c:2s $TEST_BASE_DIR/file{0..3}
+do_test $TEST_BASE_DIR/file1 draid1-0-0 draid1-0-1
+
+log_pass "Verified we cannot attach two spares to same failed vdev"