Add a statechange notify zedlet

Now that ZED has internal fault diagnosis and the statechange event
is generated for faulted states, we can replace the io-notify and
checksum-notify zedlets with one based on statechange.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@intel.com>
Closes #5383
This commit is contained in:
Don Brady 2016-11-10 14:52:59 -07:00 committed by Brian Behlendorf
parent 32dec7bd1a
commit 0df15db98f
6 changed files with 125 additions and 82 deletions

View File

@ -61,23 +61,21 @@ zedexecdir = $(libexecdir)/zfs/zed.d
# Zedlet scripts shipped with the distribution.
# NOTE(review): presumably installed into $(zedexecdir) via Automake's
# dist_<dir>_SCRIPTS convention -- confirm against the full Makefile.am.
dist_zedexec_SCRIPTS = \
zed.d/all-debug.sh \
zed.d/all-syslog.sh \
zed.d/checksum-notify.sh \
zed.d/data-notify.sh \
zed.d/generic-notify.sh \
zed.d/io-notify.sh \
zed.d/resilver_finish-notify.sh \
zed.d/scrub_finish-notify.sh \
zed.d/statechange-led.sh \
zed.d/statechange-notify.sh \
zed.d/vdev_clear-led.sh
# NOTE(review): looks like the subset of zedlets enabled by default (likely
# consumed by the install-data-hook target that follows) -- confirm.
zedconfdefaults = \
all-syslog.sh \
checksum-notify.sh \
data-notify.sh \
io-notify.sh \
resilver_finish-notify.sh \
scrub_finish-notify.sh \
statechange-led.sh \
statechange-notify.sh \
vdev_clear-led.sh
install-data-hook:

View File

@ -1 +0,0 @@
io-notify.sh

View File

@ -1 +0,0 @@
io-notify.sh

43
cmd/zed/zed.d/data-notify.sh Executable file
View File

@ -0,0 +1,43 @@
#!/bin/sh
#
# Report a ZFS DATA error event through the configured notification methods.
#
# To keep a burst of events for the same pool/[vdev] from flooding the
# recipient, at most one notification is sent per ZED_NOTIFY_INTERVAL_SECS
# for any given class/pool/[vdev] combination.
#
# Exit status:
#   0  notification sent
#   1  notification failed
#   2  notification not configured
#   3  notification suppressed (also used when ZED_NOTIFY_DATA is empty)
#   9  internal error (event carries no pool or no subclass)

# Load the optional configuration first, then the shared zedlet helpers.
if [ -f "${ZED_ZEDLET_DIR}/zed.rc" ]; then
    . "${ZED_ZEDLET_DIR}/zed.rc"
fi
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

# The event must name both a pool and a subclass.
if [ -z "${ZEVENT_POOL}" ] || [ -z "${ZEVENT_SUBCLASS}" ]; then
    exit 9
fi

# Data-error notifications are opt-in.
if [ -z "${ZED_NOTIFY_DATA}" ]; then
    exit 3
fi

# One rate-limit bucket per pool / vdev / subclass.
throttle_key="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify"
if ! zed_rate_limit "${throttle_key}"; then
    exit 3
fi

# The message file must not be readable by other users.
umask 077

msg_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)"

# Default TMPDIR to /tmp (assigning it, as a side effect) when it is empty.
: "${TMPDIR:="/tmp"}"
msg_file="${TMPDIR}/$(basename -- "$0").${ZEVENT_EID}.$$"

# Print the notification body on stdout.
emit_data_note_body()
{
    echo "ZFS has detected a data error:"
    echo
    echo " eid: ${ZEVENT_EID}"
    echo " class: ${ZEVENT_SUBCLASS}"
    echo " host: $(hostname)"
    echo " time: ${ZEVENT_TIME_STRING}"
    echo " error: ${ZEVENT_ZIO_ERR}"
    echo " objid: ${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}"
    echo " pool: ${ZEVENT_POOL}"
}

emit_data_note_body > "${msg_file}"

# Remember the notifier's status so the cleanup below cannot mask it.
status=0
zed_notify "${msg_subject}" "${msg_file}" || status=$?
rm -f "${msg_file}"
exit "${status}"

View File

@ -1,64 +0,0 @@
#!/bin/sh
#
# Report a ZFS CHECKSUM, DATA, or IO error event through the configured
# notification methods.
#
# Notifications are throttled: for any one class/pool/[vdev] combination no
# more than a single message is sent per ZED_NOTIFY_INTERVAL_SECS, so several
# events arriving together for the same pool/[vdev] do not spam the recipient.
#
# Exit status:
#   0  notification sent
#   1  notification failed
#   2  notification not configured
#   3  notification suppressed
#   9  internal error (missing pool/subclass, or unsupported subclass)

# Load the optional configuration first, then the shared zedlet helpers.
if [ -f "${ZED_ZEDLET_DIR}/zed.rc" ]; then
    . "${ZED_ZEDLET_DIR}/zed.rc"
fi
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

# The event must name both a pool and a subclass.
if [ -z "${ZEVENT_POOL}" ] || [ -z "${ZEVENT_SUBCLASS}" ]; then
    exit 9
fi

# Only the three error subclasses listed above are handled here.
case "${ZEVENT_SUBCLASS}" in
    checksum|data|io)
        ;;
    *)
        zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\""
        exit 9
        ;;
esac

# One rate-limit bucket per pool / vdev / subclass.
throttle_key="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify"
if ! zed_rate_limit "${throttle_key}"; then
    exit 3
fi

# The message file must not be readable by other users.
umask 077

msg_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)"

# Default TMPDIR to /tmp (assigning it, as a side effect) when it is empty.
: "${TMPDIR:="/tmp"}"
msg_file="${TMPDIR}/$(basename -- "$0").${ZEVENT_EID}.$$"

# "io" starts with a vowel sound, so it takes "an" rather than "a".
if [ "${ZEVENT_SUBCLASS}" = "io" ]; then
    indef_article="an"
else
    indef_article="a"
fi

# Print the notification body on stdout; vdev details appear only when the
# event supplies them.
emit_error_note_body()
{
    echo "ZFS has detected ${indef_article} ${ZEVENT_SUBCLASS} error:"
    echo
    echo " eid: ${ZEVENT_EID}"
    echo " class: ${ZEVENT_SUBCLASS}"
    echo " host: $(hostname)"
    echo " time: ${ZEVENT_TIME_STRING}"
    if [ -n "${ZEVENT_VDEV_TYPE}" ]; then
        echo " vtype: ${ZEVENT_VDEV_TYPE}"
    fi
    if [ -n "${ZEVENT_VDEV_PATH}" ]; then
        echo " vpath: ${ZEVENT_VDEV_PATH}"
    fi
    if [ -n "${ZEVENT_VDEV_GUID}" ]; then
        echo " vguid: ${ZEVENT_VDEV_GUID}"
    fi
    if [ -n "${ZEVENT_VDEV_CKSUM_ERRORS}" ]; then
        echo " cksum: ${ZEVENT_VDEV_CKSUM_ERRORS}"
    fi
    if [ -n "${ZEVENT_VDEV_READ_ERRORS}" ]; then
        echo " read: ${ZEVENT_VDEV_READ_ERRORS}"
    fi
    if [ -n "${ZEVENT_VDEV_WRITE_ERRORS}" ]; then
        echo " write: ${ZEVENT_VDEV_WRITE_ERRORS}"
    fi
    echo " pool: ${ZEVENT_POOL}"
}

emit_error_note_body > "${msg_file}"

# Remember the notifier's status so the cleanup below cannot mask it.
status=0
zed_notify "${msg_subject}" "${msg_file}" || status=$?
rm -f "${msg_file}"
exit "${status}"

View File

@ -0,0 +1,74 @@
#!/bin/sh
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# CDDL HEADER END
#
#
# Send notification in response to a fault-induced statechange.
#
# ZEVENT_SUBCLASS: 'statechange'
# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED'
#
# Exit codes:
# 0: notification sent
# 1: notification failed
# 2: notification not configured
# 3: statechange not relevant
# 4: statechange string missing (unexpected)

# Load the optional configuration first, then the shared zedlet helpers.
[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

[ -n "${ZEVENT_VDEV_STATE_STR}" ] || exit 4

# Only the three fault-related states are worth a notification.
case "${ZEVENT_VDEV_STATE_STR}" in
    FAULTED|DEGRADED|REMOVED)
        ;;
    *)
        exit 3
        ;;
esac

# The message file must not be readable by other users.
umask 077
note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)"
note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
{
    # This script runs under /bin/sh, so the state is matched with `case`
    # rather than the non-POSIX `[ ... == ... ]`, which strict shells reject.
    case "${ZEVENT_VDEV_STATE_STR}" in
        FAULTED)
            echo "The number of I/O errors associated with a ZFS device exceeded"
            echo "acceptable levels. ZFS has marked the device as faulted."
            ;;
        DEGRADED)
            echo "The number of checksum errors associated with a ZFS device"
            echo "exceeded acceptable levels. ZFS has marked the device as"
            echo "degraded."
            ;;
        *)
            # Only REMOVED can reach this arm; other states exited above.
            echo "ZFS has detected that a device was removed."
            ;;
    esac
    echo
    echo " impact: Fault tolerance of the pool may be compromised."
    echo " eid: ${ZEVENT_EID}"
    echo " class: ${ZEVENT_SUBCLASS}"
    echo " state: ${ZEVENT_VDEV_STATE_STR}"
    echo " host: $(hostname)"
    echo " time: ${ZEVENT_TIME_STRING}"
    # Vdev details are printed only when the event supplies them.
    [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}"
    [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}"
    [ -n "${ZEVENT_VDEV_PHYSPATH}" ] && echo " vphys: ${ZEVENT_VDEV_PHYSPATH}"
    [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}"
    [ -n "${ZEVENT_VDEV_DEVID}" ] && echo " devid: ${ZEVENT_VDEV_DEVID}"
    echo " pool: ${ZEVENT_POOL_GUID}"
} > "${note_pathname}"

# Capture the notifier's status before cleanup so rm cannot mask it.
zed_notify "${note_subject}" "${note_pathname}"; rv=$?
rm -f "${note_pathname}"
exit "${rv}"

View File

@ -50,6 +50,12 @@
#
#ZED_NOTIFY_VERBOSE=0
##
# Send notifications for 'ereport.fs.zfs.data' events.
# Disabled by default; uncomment to enable.
#
#ZED_NOTIFY_DATA=1
##
# Pushbullet access token.
# This grants full access to your account -- protect it accordingly!
@ -73,18 +79,6 @@
#
#ZED_RUNDIR="/var/run"
##
# Replace a device with a hot spare after N checksum errors are detected.
# Disabled by default; uncomment to enable.
#
#ZED_SPARE_ON_CHECKSUM_ERRORS=10
##
# Replace a device with a hot spare after N I/O errors are detected.
# Disabled by default; uncomment to enable.
#
#ZED_SPARE_ON_IO_ERRORS=1
##
# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for
# device mapper and multipath devices as well. Your enclosure must be