2014-01-25 03:47:46 +04:00
|
|
|
#!/bin/sh
|
|
|
|
#
|
|
|
|
# Replace a device with a hot spare in response to IO or checksum errors.
|
|
|
|
# The following actions will be performed automatically when the number
|
|
|
|
# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or
|
|
|
|
# ZED_SPARE_ON_CHECKSUM_ERRORS.
|
|
|
|
#
|
|
|
|
# 1) FAULT the device on IO errors, no futher IO will be attempted.
|
|
|
|
# DEGRADE the device on checksum errors, the device is still
|
|
|
|
# functional and can be used to service IO requests.
|
|
|
|
# 2) Set the SES fault beacon for the device.
|
|
|
|
# 3) Replace the device with a hot spare if any are available.
|
|
|
|
#
|
|
|
|
# Once the hot sparing operation is complete either the failed device or
|
|
|
|
# the hot spare must be manually retired using the 'zpool detach' command.
|
|
|
|
# The 'autoreplace' functionality which would normally take care of this
|
|
|
|
# under Illumos has not yet been implemented.
|
|
|
|
#
|
|
|
|
# Full support for autoreplace is planned, but it requires that the full
|
|
|
|
# ZFS Diagnosis Engine be ported. In the meanwhile this script provides
|
|
|
|
# the majority of the expected hot spare functionality.
|
|
|
|
#
|
|
|
|
# Exit codes:
|
|
|
|
# 0: replaced by hot spare
|
|
|
|
# 1: no hot spare device available
|
|
|
|
# 2: hot sparing disabled
|
|
|
|
# 3: already faulted or degraded
|
|
|
|
# 4: unsupported event class
|
|
|
|
# 5: internal error
|
|
|
|
#
|
2014-09-19 22:10:28 +04:00
|
|
|
test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc"
|
2014-01-25 03:47:46 +04:00
|
|
|
|
|
|
|
test -n "${ZEVENT_POOL}" || exit 5
|
|
|
|
test -n "${ZEVENT_SUBCLASS}" || exit 5
|
|
|
|
test -n "${ZEVENT_VDEV_PATH}" || exit 5
|
|
|
|
test -n "${ZEVENT_VDEV_GUID}" || exit 5
|
|
|
|
|
|
|
|
# Defaults to disabled, enable in the zed.rc file.
|
|
|
|
ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
|
|
|
|
ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}
|
|
|
|
|
|
|
|
if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \
|
|
|
|
${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then
|
|
|
|
exit 2
|
|
|
|
fi
|
|
|
|
|
|
|
|
# A lock file is used to serialize execution.
|
|
|
|
ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
|
|
|
|
LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"
|
|
|
|
|
|
|
|
exec 8> "${LOCKFILE}"
|
|
|
|
flock -x 8
|
|
|
|
|
|
|
|
# Given a <pool> and <device> return the status, (ONLINE, FAULTED, etc...).
|
|
|
|
vdev_status() {
|
|
|
|
local POOL=$1
|
2015-04-10 22:56:21 +03:00
|
|
|
local VDEV=`basename $2`
|
2014-09-12 02:41:35 +04:00
|
|
|
local T=' ' # tab character since '\t' isn't portable
|
2014-01-25 03:47:46 +04:00
|
|
|
|
2014-09-12 02:41:35 +04:00
|
|
|
${ZPOOL} status ${POOL} | sed -n -e \
|
|
|
|
"s,^[ $T]*\(.*$VDEV\(-part[0-9]\+\)\?\)[ $T]*\([A-Z]\+\).*,\1 \3,p"
|
2014-01-25 03:47:46 +04:00
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
# Fault devices after N I/O errors.
|
|
|
|
if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
|
|
|
|
ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}`
|
|
|
|
|
|
|
|
if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \
|
|
|
|
${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then
|
|
|
|
ACTION="fault"
|
|
|
|
fi
|
|
|
|
# Degrade devices after N checksum errors.
|
|
|
|
elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
|
|
|
|
ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS}
|
|
|
|
|
|
|
|
if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \
|
|
|
|
${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then
|
|
|
|
ACTION="degrade"
|
|
|
|
fi
|
|
|
|
else
|
|
|
|
ACTION=
|
|
|
|
fi
|
|
|
|
|
|
|
|
if [ -n "${ACTION}" ]; then
|
|
|
|
|
|
|
|
# Device is already FAULTED or DEGRADED
|
|
|
|
set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}`
|
|
|
|
ZEVENT_VDEV_PATH_FOUND=$1
|
|
|
|
STATUS=$2
|
|
|
|
if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then
|
|
|
|
exit 3
|
|
|
|
fi
|
|
|
|
|
|
|
|
# Step 1) FAULT or DEGRADE the device
|
|
|
|
#
|
|
|
|
${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL}
|
|
|
|
|
|
|
|
# Step 2) Set the SES fault beacon.
|
|
|
|
#
|
|
|
|
# XXX: Set the 'fault' or 'ident' beacon for the device. This can
|
|
|
|
# be done through the sg_ses utility, the only hard part is to map
|
|
|
|
# the sd device to its corresponding enclosure and slot. We may
|
|
|
|
# be able to leverage the existing vdev_id scripts for this.
|
|
|
|
#
|
|
|
|
# $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
|
|
|
|
# $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3
|
|
|
|
|
|
|
|
# Step 3) Replace the device with a hot spare.
|
|
|
|
#
|
|
|
|
# Round robin through the spares selecting those which are available.
|
|
|
|
#
|
|
|
|
for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do
|
|
|
|
set -- `vdev_status ${ZEVENT_POOL} ${SPARE}`
|
|
|
|
SPARE_VDEV_FOUND=$1
|
|
|
|
STATUS=$2
|
|
|
|
if [ "${STATUS}" = "AVAIL" ]; then
|
|
|
|
${ZPOOL} replace ${ZEVENT_POOL} \
|
|
|
|
${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0
|
|
|
|
fi
|
|
|
|
done
|
|
|
|
|
|
|
|
exit 1
|
|
|
|
fi
|
|
|
|
|
|
|
|
exit 4
|