From d9daa7abcf04f75ba013ec954c4f2d4854ba1cbc Mon Sep 17 00:00:00 2001 From: David Quigley Date: Mon, 23 Oct 2017 12:42:37 -0600 Subject: [PATCH] ZTS: Add auto-spare tests The ZED is expected to automatically kick in a hot spare device when there's one available in the pool and a sufficient number of read errors have been encountered. Use zinject to simulate the failure condition and verify the hot spare is used. auto_spare_001_pos.ksh: read IO errors, the vdev is FAULTED auto_spare_002_pos.ksh: read CHECKSUM errors, the vdev is DEGRADED Reviewed-by: Richard Elling Reviewed-by: Brian Behlendorf Signed-off-by: David Quigley Closes #6280 --- tests/runfiles/linux.run | 3 +- tests/zfs-tests/include/libtest.shlib | 50 ++++++++++ .../tests/functional/fault/Makefile.am | 4 +- .../functional/fault/auto_spare_001_pos.ksh | 91 +++++++++++++++++++ .../functional/fault/auto_spare_002_pos.ksh | 90 ++++++++++++++++++ .../tests/functional/fault/fault.cfg | 5 + 6 files changed, 241 insertions(+), 2 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 47ddc6bc3..6fe6b6588 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -381,7 +381,8 @@ tests = ['events_001_pos', 'events_002_pos'] tests = ['exec_001_pos', 'exec_002_neg'] [tests/functional/fault] -tests = ['auto_online_001_pos', 'auto_replace_001_pos'] +tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos', + 'auto_spare_002_pos'] [tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index eef678455..0e7f20f0e 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -2029,6 +2029,31 @@ function check_hotspare_state # pool disk 
state{inuse,avail} return 0 } +# +# Wait until a hotspare transitions to a given state or times out. +# Polls about once per second, up to timeout tries, default 60. +# Return 0 when pool/disk matches expected state, 1 on timeout. +# +function wait_hotspare_state # pool disk state timeout +{ + typeset pool=$1 + typeset disk=${2#$DEV_DSKDIR/} # strip a leading $DEV_DSKDIR/ + typeset state=$3 + typeset timeout=${4:-60} + typeset -i i=0 + + while [[ $i -lt $timeout ]]; do + if check_hotspare_state $pool $disk $state; then + return 0 + fi + + i=$((i+1)) + sleep 1 + done + + return 1 +} + # # Verify a given slog disk is inuse or avail # @@ -2067,6 +2092,31 @@ function check_vdev_state # pool disk state{online,offline,unavail} return 0 } +# +# Wait until a vdev transitions to a given state or times out. +# Polls about once per second, up to timeout tries, default 60. +# Return 0 when pool/disk matches expected state, 1 on timeout. +# +function wait_vdev_state # pool disk state timeout +{ + typeset pool=$1 + typeset disk=${2#$DEV_DSKDIR/} # strip a leading $DEV_DSKDIR/ + typeset state=$3 + typeset timeout=${4:-60} + typeset -i i=0 + + while [[ $i -lt $timeout ]]; do + if check_vdev_state $pool $disk $state; then + return 0 + fi + + i=$((i+1)) + sleep 1 + done + + return 1 +} + # # Check the output of 'zpool status -v ', # and to see if the content of contain the specified. 
diff --git a/tests/zfs-tests/tests/functional/fault/Makefile.am b/tests/zfs-tests/tests/functional/fault/Makefile.am index eeff31261..436f3e8be 100644 --- a/tests/zfs-tests/tests/functional/fault/Makefile.am +++ b/tests/zfs-tests/tests/functional/fault/Makefile.am @@ -4,4 +4,6 @@ dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ auto_online_001_pos.ksh \ - auto_replace_001_pos.ksh + auto_replace_001_pos.ksh \ + auto_spare_001_pos.ksh \ + auto_spare_002_pos.ksh diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh new file mode 100755 index 000000000..82f7f4834 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Intel Corporation. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when +# drive is faulted due to IO ERRORS. +# +# STRATEGY: +# 1. Create a pool with hot spares +# 2. Create a filesystem with the primary cache disabled to force reads +# 3. Write a file to the pool to be read back +# 4. Inject IO ERRORS on read with a zinject error handler +# 5. Verify the ZED kicks in a hot spare and expected pool/device status +# 6. Clear the fault +# 7. 
Verify the hot spare is available and expected pool/device status +# + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + poolexists $TESTPOOL && destroy_pool $TESTPOOL + rm -f $VDEV_FILES $SPARE_FILE +} + +log_assert "Testing automated auto-spare FMA test" + +log_onexit cleanup + +TESTFILE="/$TESTPOOL/$TESTFS/testfile" + +for type in "mirror" "raidz" "raidz2"; do + # 1. Create a pool with hot spares + log_must truncate -s $SPA_MINDEVSIZE $VDEV_FILES $SPARE_FILE + log_must zpool create -f $TESTPOOL $type $VDEV_FILES spare $SPARE_FILE + + # 2. Create a filesystem with the primary cache disabled to force reads + log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS + log_must zfs set recordsize=16k $TESTPOOL/$TESTFS + + # 3. Write a file to the pool to be read back + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=16 + + # 4. Inject IO ERRORS on read with a zinject error handler + log_must zinject -d $FAULT_FILE -e io -T read $TESTPOOL + log_must cp $TESTFILE /dev/null + + # 5. Verify the ZED kicks in a hot spare and expected pool/device status + log_note "Wait for ZED to auto-spare" + log_must wait_vdev_state $TESTPOOL $FAULT_FILE "FAULTED" 60 + log_must wait_vdev_state $TESTPOOL $SPARE_FILE "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "INUSE" + log_must check_state $TESTPOOL "" "DEGRADED" + + # 6. Clear the fault + log_must zinject -c all + log_must zpool clear $TESTPOOL $FAULT_FILE + + # 7. 
Verify the hot spare is available and expected pool/device status + log_must wait_vdev_state $TESTPOOL $FAULT_FILE "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "AVAIL" + log_must is_pool_resilvered $TESTPOOL + log_must check_state $TESTPOOL "" "ONLINE" + + cleanup +done + +log_pass "Auto-spare test successful" diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh new file mode 100755 index 000000000..f0ddac35c --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Intel Corporation. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when +# drive is degraded due to CHECKSUM ERRORS. +# +# STRATEGY: +# 1. Create a pool with hot spares +# 2. Create a filesystem with the primary cache disabled to force reads +# 3. Write a file to the pool to be read back +# 4. Inject CHECKSUM ERRORS on read with a zinject error handler +# 5. Verify the ZED kicks in a hot spare and expected pool/device status +# 6. Clear the fault +# 7. 
Verify the hot spare is available and expected pool/device status +# + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + poolexists $TESTPOOL && destroy_pool $TESTPOOL + rm -f $VDEV_FILES $SPARE_FILE +} + +log_assert "Testing automated auto-spare FMA test" + +log_onexit cleanup + +TESTFILE="/$TESTPOOL/$TESTFS/testfile" + +for type in "mirror" "raidz" "raidz2"; do + # 1. Create a pool with hot spares + log_must truncate -s $SPA_MINDEVSIZE $VDEV_FILES $SPARE_FILE + log_must zpool create -f $TESTPOOL $type $VDEV_FILES spare $SPARE_FILE + + # 2. Create a filesystem with the primary cache disabled to force reads + log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS + log_must zfs set recordsize=16k $TESTPOOL/$TESTFS + + # 3. Write a file to the pool to be read back + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=16 + + # 4. Inject CHECKSUM ERRORS on read with a zinject error handler + log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL + log_must cp $TESTFILE /dev/null + + # 5. Verify the ZED kicks in a hot spare and expected pool/device status + log_note "Wait for ZED to auto-spare" + log_must wait_vdev_state $TESTPOOL $FAULT_FILE "DEGRADED" 60 + log_must wait_vdev_state $TESTPOOL $SPARE_FILE "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "INUSE" + log_must check_state $TESTPOOL "" "DEGRADED" + + # 6. Clear the fault + log_must zinject -c all + log_must zpool clear $TESTPOOL $FAULT_FILE + + # 7. 
Verify the hot spare is available and expected pool/device status + log_must wait_vdev_state $TESTPOOL $FAULT_FILE "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "AVAIL" + log_must check_state $TESTPOOL "" "ONLINE" + + cleanup +done + +log_pass "Auto-spare test successful" diff --git a/tests/zfs-tests/tests/functional/fault/fault.cfg b/tests/zfs-tests/tests/functional/fault/fault.cfg index e6e4fe582..16a4fb835 100644 --- a/tests/zfs-tests/tests/functional/fault/fault.cfg +++ b/tests/zfs-tests/tests/functional/fault/fault.cfg @@ -51,3 +51,8 @@ if is_linux; then else DEV_DSKDIR="/dev" fi + +export VDEV_FILES="$TEST_BASE_DIR/file-1 $TEST_BASE_DIR/file-2 \ + $TEST_BASE_DIR/file-3 $TEST_BASE_DIR/file-4" +export SPARE_FILE="$TEST_BASE_DIR/spare-1" +export FAULT_FILE="$TEST_BASE_DIR/file-1"