Fix double spares for failed vdev

It's possible for two spares to get attached to a single failed vdev.
This happens when a failed disk is spared, the failed disk is then
replaced with a new disk, and the new disk fails during the resilver,
so ZED kicks in a second spare for the failed new disk.  This commit
checks for that condition and disallows it.
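
For illustration only (an editorial sketch, not part of the commit message):
the sequence can be reproduced with ordinary zpool commands, much as the new
test added below does.  Pool, disk, and spare names are placeholders, and
forcing the fault with "zpool offline -f" is just one of several ways a disk
can end up faulted.

  # Pool with hot spares available (placeholder names).
  zpool create tank mirror disk1 disk2 spare spare1 spare2

  # disk1 fails; ZED (or the admin) attaches the first spare.
  zpool offline -f tank disk1
  zpool replace tank disk1 spare1

  # disk1 is swapped for a blank disk and a resilver to it starts.
  zpool replace tank disk1

  # The new disk faults mid-resilver; attaching the second spare used to
  # leave two spares on the same slot, and with this change fails (EEXIST).
  zpool offline -f tank disk1
  zpool replace tank disk1 spare2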

Reviewed-by: Akash B <akash-b@hpe.com>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes: #16547
Closes: #17231
Author: Tony Hutter, 2025-05-02 09:03:11 -07:00 (committed via GitHub)
parent 3b18877269
commit f40ab9e399
4 changed files with 209 additions and 4 deletions

@@ -7430,6 +7430,82 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
 	return (0);
 }
 
+/*
+ * Given a vdev to be replaced and its parent, check for a possible
+ * "double spare" condition if a vdev is to be replaced by a spare.  When this
+ * happens, you can get two spares assigned to one failed vdev.
+ *
+ * To trigger a double spare condition:
+ *
+ * 1. disk1 fails
+ * 2. 1st spare is kicked in for disk1 and it resilvers
+ * 3. Someone replaces disk1 with a new blank disk
+ * 4. New blank disk starts resilvering
+ * 5. While resilvering, new blank disk has IO errors and faults
+ * 6. 2nd spare is kicked in for new blank disk
+ * 7. At this point two spares are kicked in for the original disk1.
+ *
+ * It looks like this:
+ *
+ * NAME                                            STATE     READ WRITE CKSUM
+ * tank2                                           DEGRADED     0     0     0
+ *   draid2:6d:10c:2s-0                            DEGRADED     0     0     0
+ *     scsi-0QEMU_QEMU_HARDDISK_d1                 ONLINE       0     0     0
+ *     scsi-0QEMU_QEMU_HARDDISK_d2                 ONLINE       0     0     0
+ *     scsi-0QEMU_QEMU_HARDDISK_d3                 ONLINE       0     0     0
+ *     scsi-0QEMU_QEMU_HARDDISK_d4                 ONLINE       0     0     0
+ *     scsi-0QEMU_QEMU_HARDDISK_d5                 ONLINE       0     0     0
+ *     scsi-0QEMU_QEMU_HARDDISK_d6                 ONLINE       0     0     0
+ *     scsi-0QEMU_QEMU_HARDDISK_d7                 ONLINE       0     0     0
+ *     scsi-0QEMU_QEMU_HARDDISK_d8                 ONLINE       0     0     0
+ *     scsi-0QEMU_QEMU_HARDDISK_d9                 ONLINE       0     0     0
+ *     spare-9                                     DEGRADED     0     0     0
+ *       replacing-0                               DEGRADED     0    93     0
+ *         scsi-0QEMU_QEMU_HARDDISK_d10-part1/old  UNAVAIL      0     0     0
+ *         spare-1                                 DEGRADED     0     0     0
+ *           scsi-0QEMU_QEMU_HARDDISK_d10          REMOVED      0     0     0
+ *           draid2-0-0                            ONLINE       0     0     0
+ *       draid2-0-1                                ONLINE       0     0     0
+ * spares
+ *   draid2-0-0                                    INUSE     currently in use
+ *   draid2-0-1                                    INUSE     currently in use
+ *
+ * ARGS:
+ *
+ * newvd: New spare disk
+ * pvd:   Parent vdev_t the spare should attach to
+ *
+ * This function returns B_TRUE if adding the new vdev would create a double
+ * spare condition, B_FALSE otherwise.
+ */
+static boolean_t
+spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd)
+{
+	vdev_t *ppvd;
+
+	ppvd = pvd->vdev_parent;
+	if (ppvd == NULL)
+		return (B_FALSE);
+
+	/*
+	 * To determine if this configuration would cause a double spare, we
+	 * look at the vdev_op of the parent vdev, and of the parent's parent
+	 * vdev.  We also look at vdev_isspare on the new disk.  A double spare
+	 * condition looks like this:
+	 *
+	 * 1. parent of parent's op is a spare or draid spare
+	 * 2. parent's op is replacing
+	 * 3. new disk is a spare
+	 */
+	if ((ppvd->vdev_ops == &vdev_spare_ops) ||
+	    (ppvd->vdev_ops == &vdev_draid_spare_ops))
+		if (pvd->vdev_ops == &vdev_replacing_ops)
+			if (newvd->vdev_isspare)
+				return (B_TRUE);
+
+	return (B_FALSE);
+}
+
 /*
  * Attach a device to a vdev specified by its guid.  The vdev type can be
  * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
@@ -7604,6 +7680,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 	}
 
+	if (spa_vdev_new_spare_would_cause_double_spares(newvd, pvd)) {
+		vdev_dbgmsg(newvd,
+		    "disk would create double spares, ignore.");
+		return (spa_vdev_exit(spa, newrootvd, txg, EEXIST));
+	}
+
 	if (newvd->vdev_isspare)
 		pvops = &vdev_spare_ops;
 	else

@@ -123,10 +123,10 @@ tags = ['functional', 'fallocate']
 [tests/functional/fault:Linux]
 tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
     'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
-    'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
-    'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
-    'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors',
-    'suspend_resume_single', 'zpool_status_-s']
+    'auto_spare_002_pos', 'auto_spare_double', 'auto_spare_multiple',
+    'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault',
+    'decompress_fault', 'fault_limits', 'scrub_after_resilver',
+    'suspend_on_probe_errors', 'suspend_resume_single', 'zpool_status_-s']
 tags = ['functional', 'fault']
 
 [tests/functional/features/large_dnode:Linux]

@@ -1535,6 +1535,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/fault/auto_spare_001_pos.ksh \
 	functional/fault/auto_spare_002_pos.ksh \
 	functional/fault/auto_spare_ashift.ksh \
+	functional/fault/auto_spare_double.ksh \
 	functional/fault/auto_spare_multiple.ksh \
 	functional/fault/auto_spare_shared.ksh \
 	functional/fault/cleanup.ksh \

@@ -0,0 +1,122 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 by Lawrence Livermore National Security, LLC.
+#
+# DESCRIPTION:
+# Try to induce a double spare condition and verify we're prevented from doing
+# it.
+#
+# STRATEGY:
+# 1. Fail a drive
+# 2. Kick in first spare
+# 3. Bring new drive back online and start resilvering to it
+# 4. Immediately after the resilver starts, fail the new drive.
+# 5. Try to kick in the second spare for the new drive (which is now failed)
+# 6. Verify that we can't kick in the second spare
+#
+# Repeat this test for both traditional spares and dRAID spares.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+LAST_VDEV=3
+SIZE_MB=300
+ZED_PID="$(zed_check)"
+
+function cleanup
+{
+	destroy_pool $TESTPOOL
+	for i in {0..$LAST_VDEV} ; do
+		log_must rm -f $TEST_BASE_DIR/file$i
+	done
+
+	# Restore ZED if it was running before this test
+	if [ -n "$ZED_PID" ] ; then
+		log_must zed_start
+	fi
+}
+
+log_assert "Cannot attach two spares to same failed vdev"
+log_onexit cleanup
+
+# Stop ZED if it's running
+if [ -n "$ZED_PID" ] ; then
+	log_must zed_stop
+fi
+
+log_must truncate -s ${SIZE_MB}M $TEST_BASE_DIR/file{0..$LAST_VDEV}
+
+ZFS_DBGMSG=/proc/spl/kstat/zfs/dbgmsg
+
+# Run the test - we assume the pool is already created.
+# $1: disk to fail
+# $2: 1st spare name
+# $3: 2nd spare name
+function do_test {
+	FAIL_DRIVE=$1
+	SPARE0=$2
+	SPARE1=$3
+
+	echo 0 > $ZFS_DBGMSG
+	log_must zpool status
+
+	log_note "Kicking in first spare ($SPARE0)"
+	log_must zpool offline -f $TESTPOOL $FAIL_DRIVE
+	log_must zpool replace $TESTPOOL $FAIL_DRIVE $SPARE0
+
+	# Fill the pool with data to make the resilver take a little
+	# time.
+	dd if=/dev/zero of=/$TESTPOOL/testfile bs=1M || true
+
+	# Zero our failed disk.  It will appear as a blank.
+	rm -f $FAIL_DRIVE
+	truncate -s ${SIZE_MB}M $FAIL_DRIVE
+
+	# Attempt to replace our failed disk, then immediately fault it.
+	log_must zpool replace $TESTPOOL $FAIL_DRIVE
+	log_must zpool offline -f $TESTPOOL $FAIL_DRIVE
+	log_must check_state $TESTPOOL $FAIL_DRIVE "faulted"
+
+	log_note "Kicking in second spare ($SPARE1)... This should not work..."
+	log_mustnot zpool replace $TESTPOOL $FAIL_DRIVE $SPARE1
+
+	# Verify the debug line in dbgmsg
+	log_must grep 'disk would create double spares' $ZFS_DBGMSG
+
+	# Disk should still be faulted
+	log_must check_state $TESTPOOL $FAIL_DRIVE "faulted"
+}
+
+# Test with traditional spares
+log_must zpool create -O compression=off -O recordsize=4k -O primarycache=none \
+	$TESTPOOL mirror $TEST_BASE_DIR/file{0,1} spare $TEST_BASE_DIR/file{2,3}
+do_test $TEST_BASE_DIR/file1 $TEST_BASE_DIR/file2 $TEST_BASE_DIR/file3
+destroy_pool $TESTPOOL
+
+# Clear vdev files for next test
+for i in {0..$LAST_VDEV} ; do
+	log_must rm -f $TEST_BASE_DIR/file$i
+done
+log_must truncate -s ${SIZE_MB}M $TEST_BASE_DIR/file{0..$LAST_VDEV}
+
+# Test with dRAID spares
+log_must zpool create -O compression=off -O recordsize=4k -O primarycache=none \
+	$TESTPOOL draid1:1d:4c:2s $TEST_BASE_DIR/file{0..3}
+do_test $TEST_BASE_DIR/file1 draid1-0-0 draid1-0-1
+log_pass "Verified we cannot attach two spares to same failed vdev"
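
Editorial usage note (not part of the commit): with a built source tree, the
new case can be run through the ZFS Test Suite wrapper. The option spelling
and test path below are assumptions based on the wrapper's usual interface;
running the whole 'fault' tag group also covers it.

  # Run only the new test case (assumed invocation).
  ./scripts/zfs-tests.sh -t functional/fault/auto_spare_double
  # Or run the entire fault group.
  ./scripts/zfs-tests.sh -T fault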