zpool: Change zpool offline spares policy

The zpool offline man page says that you cannot use 'zpool offline'
on spares.  However, testing found that you could in fact force fault
(zpool offline -f) spares.

Change the policy to:
1. You can never force-fault or offline dRAID spares.
2. You can only force-fault or offline traditional spares if they're
   active.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #18282
This commit is contained in:
Tony Hutter 2026-03-25 11:08:55 -07:00 committed by GitHub
parent 931deb290c
commit b44a3ecf4a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 137 additions and 8 deletions

View File

@ -3571,10 +3571,53 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
zfs_cmd_t zc = {"\0"};
char errbuf[ERRBUFLEN];
libzfs_handle_t *hdl = zhp->zpool_hdl;
nvlist_t *vdev_nv;
boolean_t avail_spare, l2cache;
char *vdev_name;
char guid_str[21]; /* 64-bit num + '\0' */
boolean_t is_draid_spare = B_FALSE;
const char *vdev_type;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid);
snprintf(guid_str, sizeof (guid_str), "%llu", (u_longlong_t)guid);
if ((vdev_nv = zpool_find_vdev(zhp, guid_str, &avail_spare,
&l2cache, NULL)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, errbuf));
vdev_name = zpool_vdev_name(hdl, zhp, vdev_nv, 0);
if (vdev_name != NULL) {
/*
* We have the actual vdev name, so use that instead of the GUID
* in any error messages.
*/
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot fault %s"), vdev_name);
free(vdev_name);
}
/*
* Spares (traditional or draid) cannot be faulted by libzfs, except:
*
* - Any spare type that exceeds its error limits can be faulted (aux =
* VDEV_AUX_ERR_EXCEEDED). This is only used by zed.
*
* - Traditional spares that are active can be force faulted.
*/
if (nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_TYPE, &vdev_type) == 0)
if (strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0)
is_draid_spare = B_TRUE;
/*
* If vdev is a spare that is not being used, or is a dRAID spare (in
* use or not), then don't allow it to be force-faulted. However, an
* in-use dRAID spare can be faulted by ZED if it sees too many errors
* (aux = VDEV_AUX_ERR_EXCEEDED).
*/
if (avail_spare || (is_draid_spare && aux != VDEV_AUX_ERR_EXCEEDED))
return (zfs_error(hdl, EZFS_ISSPARE, errbuf));
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
zc.zc_cookie = VDEV_STATE_FAULTED;

View File

@ -56,11 +56,12 @@
.Ar pool
.Ar device Ns
.Xc
Takes the specified physical device offline.
Takes the specified physical device offline or force-faults it.
While the
.Ar device
is offline, no attempt is made to read or write to the device.
This command is not applicable to spares.
is offline or force-faulted, no attempt is made to read or write to the device.
dRAID spares cannot be offlined or force-faulted.
Traditional spares can only be offlined or force-faulted when they are active.
.Bl -tag -width Ds
.It Fl -power
Power off the device's slot in the storage enclosure.

View File

@ -525,7 +525,7 @@ tags = ['functional', 'cli_root', 'zpool_initialize']
[tests/functional/cli_root/zpool_offline]
tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg',
'zpool_offline_003_pos']
'zpool_offline_003_pos', 'zpool_offline_spare']
tags = ['functional', 'cli_root', 'zpool_offline']
[tests/functional/cli_root/zpool_online]

View File

@ -323,7 +323,8 @@ pre =
tags = ['functional', 'cli_root', 'zpool_initialize']
[tests/functional/cli_root/zpool_offline]
tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg']
tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg',
'zpool_offline_spare']
tags = ['functional', 'cli_root', 'zpool_offline']
[tests/functional/cli_root/zpool_online]

View File

@ -1220,6 +1220,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh \
functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh \
functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh \
functional/cli_root/zpool_offline/zpool_offline_spare.ksh \
functional/cli_root/zpool_online/cleanup.ksh \
functional/cli_root/zpool_online/setup.ksh \
functional/cli_root/zpool_online/zpool_online_001_pos.ksh \

View File

@ -0,0 +1,84 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright 2026 by Lawrence Livermore National Security, LLC.
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Verify that traditional spares that are active can be offlined or
# force-faulted. Verify that in all other cases, spares cannot be
# offlined or faulted.
#
# STRATEGY:
# 1. Create pool with traditional spare
# 2. Verify we can't offline or fault an inactive traditional spare
# 3. Verify we can offline and fault an active traditional spare
# 4. Create draid pool with draid spare
# 5. Verify we can't offline/fault draid spare
TESTPOOL2=testpool2
# Tear down the test pool and remove the backing file vdevs on exit.
function cleanup
{
destroy_pool $TESTPOOL2
log_must rm -f $TESTDIR/file-vdev-{1..3}
}
log_onexit cleanup
verify_runnable "global"
log_assert "Verify zpool offline has the correct behavior on spares"
# Verify any old file vdevs are gone (redirect silences ls output;
# log_mustnot still checks the exit status)
log_mustnot ls $TESTDIR/file-vdev-* &> /dev/null
log_must truncate -s 100M $TESTDIR/file-vdev-{1..3}
# Mirror of vdev-1/vdev-2 with vdev-3 as a traditional (inactive) spare
log_must zpool create $TESTPOOL2 mirror $TESTDIR/file-vdev-1 \
$TESTDIR/file-vdev-2 spare $TESTDIR/file-vdev-3
# Test that we can't offline an inactive spare
log_mustnot zpool offline $TESTPOOL2 $TESTDIR/file-vdev-3
log_mustnot zpool offline -f $TESTPOOL2 $TESTDIR/file-vdev-3
# Test that we can offline an active spare (replace puts the spare in use)
log_must zpool replace $TESTPOOL2 $TESTDIR/file-vdev-1 $TESTDIR/file-vdev-3
log_must zpool offline $TESTPOOL2 $TESTDIR/file-vdev-3
log_must zpool online $TESTPOOL2 $TESTDIR/file-vdev-3
log_must zpool offline -f $TESTPOOL2 $TESTDIR/file-vdev-3
destroy_pool $TESTPOOL2
# dRAID layout: presumably draid1:1d:1s:3c = single parity, 1 data disk
# per group, 1 distributed spare, 3 children — confirm against zpoolconcepts(7)
log_must zpool create -f $TESTPOOL2 draid1:1d:1s:3c $TESTDIR/file-vdev-{1..3}
# Test that we can't offline an inactive draid spare
# (draid1-0-0 is the auto-generated distributed-spare vdev name)
log_mustnot zpool offline $TESTPOOL2 draid1-0-0
log_mustnot zpool offline -f $TESTPOOL2 draid1-0-0
# Test that we can't offline an active draid spare
log_must zpool replace $TESTPOOL2 $TESTDIR/file-vdev-1 draid1-0-0
log_mustnot zpool offline $TESTPOOL2 draid1-0-0
log_mustnot zpool offline -f $TESTPOOL2 draid1-0-0
log_pass "zpool offline has the correct behavior on spares"

View File

@ -166,9 +166,8 @@ do
mntpnt=$(get_prop mountpoint /$TESTPOOL)
# 2. Fault the spare device making it unavailable
log_must zpool offline -f $TESTPOOL $sparedev
log_must wait_hotspare_state $TESTPOOL $sparedev "FAULTED"
# 2. Remove the spare device making it unavailable
log_must zpool remove $TESTPOOL $sparedev
# 3. Simulate physical removal of one device
remove_disk $removedev