mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-02-12 18:31:10 +03:00
![Chris Dunlap](/assets/img/avatar_default.png)
The "zpool status" output shows the full pathname for file-type vdevs,
but only the basename component for disk-type vdevs. In commit
bee6665
, the "basename" command was dropped from altering the vdev
name used when searching the "zpool status" output. Consequently,
hot-disk sparing for disk vdevs broke since "zpool status" output
was now being searched for the full pathname to the disk vdev.
Parsing the "zpool status" output in this manner is rather brittle.
It would be preferable to search for the vdev based on its guid.
But until that happens, this commit adds back the "basename" command
to fix the vdev name breakage.
Signed-off-by: Chris Dunlap <cdunlap@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3310
127 lines
3.8 KiB
Bash
Executable File
127 lines
3.8 KiB
Bash
Executable File
#!/bin/sh
|
|
#
|
|
# Replace a device with a hot spare in response to IO or checksum errors.
|
|
# The following actions will be performed automatically when the number
|
|
# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or
|
|
# ZED_SPARE_ON_CHECKSUM_ERRORS.
|
|
#
|
|
# 1) FAULT the device on IO errors, no futher IO will be attempted.
|
|
# DEGRADE the device on checksum errors, the device is still
|
|
# functional and can be used to service IO requests.
|
|
# 2) Set the SES fault beacon for the device.
|
|
# 3) Replace the device with a hot spare if any are available.
|
|
#
|
|
# Once the hot sparing operation is complete either the failed device or
|
|
# the hot spare must be manually retired using the 'zpool detach' command.
|
|
# The 'autoreplace' functionality which would normally take care of this
|
|
# under Illumos has not yet been implemented.
|
|
#
|
|
# Full support for autoreplace is planned, but it requires that the full
|
|
# ZFS Diagnosis Engine be ported. In the meanwhile this script provides
|
|
# the majority of the expected hot spare functionality.
|
|
#
|
|
# Exit codes:
|
|
# 0: replaced by hot spare
|
|
# 1: no hot spare device available
|
|
# 2: hot sparing disabled
|
|
# 3: already faulted or degraded
|
|
# 4: unsupported event class
|
|
# 5: internal error
|
|
#
|
|
test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc"
|
|
|
|
test -n "${ZEVENT_POOL}" || exit 5
|
|
test -n "${ZEVENT_SUBCLASS}" || exit 5
|
|
test -n "${ZEVENT_VDEV_PATH}" || exit 5
|
|
test -n "${ZEVENT_VDEV_GUID}" || exit 5
|
|
|
|
# Defaults to disabled, enable in the zed.rc file.
|
|
ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
|
|
ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}
|
|
|
|
if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \
|
|
${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then
|
|
exit 2
|
|
fi
|
|
|
|
# A lock file is used to serialize execution.
|
|
ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
|
|
LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"
|
|
|
|
exec 8> "${LOCKFILE}"
|
|
flock -x 8
|
|
|
|
# Given a <pool> and <device> return the status, (ONLINE, FAULTED, etc...).
|
|
vdev_status() {
|
|
local POOL=$1
|
|
local VDEV=`basename $2`
|
|
local T=' ' # tab character since '\t' isn't portable
|
|
|
|
${ZPOOL} status ${POOL} | sed -n -e \
|
|
"s,^[ $T]*\(.*$VDEV\(-part[0-9]\+\)\?\)[ $T]*\([A-Z]\+\).*,\1 \3,p"
|
|
return 0
|
|
}
|
|
|
|
# Fault devices after N I/O errors.
|
|
if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
|
|
ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}`
|
|
|
|
if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \
|
|
${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then
|
|
ACTION="fault"
|
|
fi
|
|
# Degrade devices after N checksum errors.
|
|
elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
|
|
ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS}
|
|
|
|
if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \
|
|
${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then
|
|
ACTION="degrade"
|
|
fi
|
|
else
|
|
ACTION=
|
|
fi
|
|
|
|
if [ -n "${ACTION}" ]; then
|
|
|
|
# Device is already FAULTED or DEGRADED
|
|
set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}`
|
|
ZEVENT_VDEV_PATH_FOUND=$1
|
|
STATUS=$2
|
|
if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then
|
|
exit 3
|
|
fi
|
|
|
|
# Step 1) FAULT or DEGRADE the device
|
|
#
|
|
${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL}
|
|
|
|
# Step 2) Set the SES fault beacon.
|
|
#
|
|
# XXX: Set the 'fault' or 'ident' beacon for the device. This can
|
|
# be done through the sg_ses utility, the only hard part is to map
|
|
# the sd device to its corresponding enclosure and slot. We may
|
|
# be able to leverage the existing vdev_id scripts for this.
|
|
#
|
|
# $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
|
|
# $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3
|
|
|
|
# Step 3) Replace the device with a hot spare.
|
|
#
|
|
# Round robin through the spares selecting those which are available.
|
|
#
|
|
for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do
|
|
set -- `vdev_status ${ZEVENT_POOL} ${SPARE}`
|
|
SPARE_VDEV_FOUND=$1
|
|
STATUS=$2
|
|
if [ "${STATUS}" = "AVAIL" ]; then
|
|
${ZPOOL} replace ${ZEVENT_POOL} \
|
|
${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0
|
|
fi
|
|
done
|
|
|
|
exit 1
|
|
fi
|
|
|
|
exit 4
|