zed: Add synchronous zedlets

Historically, ZED has blindly spawned off zedlets in parallel and never
worried about their completion order.  This means that you can
potentially have zedlets for event number 2 starting before zedlets for
event number 1 had finished.  Most of the time this is fine, and it
actually helps a lot when the system is getting spammed with hundreds
of events.

However, there are times when you want your zedlets to be executed
in sequence with the event ID.  That is where synchronous zedlets
come in.

ZED will wait for all previously spawned zedlets to finish before
running a synchronous zedlet.  Synchronous zedlets are guaranteed to be
the only zedlet running.  No other zedlets may run in parallel with a
synchronous zedlet.  Users should be careful to only use synchronous
zedlets when needed, since they decrease parallelism.

To make a zedlet synchronous, simply add a "-sync-" immediately
following the event name in the zedlet's file name:

	EVENT_NAME-sync-ZEDLETNAME.sh

For example, if you wanted a synchronous statechange script:

	statechange-sync-myzedlet.sh

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #17335
This commit is contained in:
Tony Hutter 2025-09-11 11:34:07 -07:00 committed by Brian Behlendorf
parent 3dc345851c
commit 4a7a04630d
15 changed files with 289 additions and 37 deletions

View File

@ -9,18 +9,18 @@ dist_zedexec_SCRIPTS = \
%D%/all-debug.sh \
%D%/all-syslog.sh \
%D%/data-notify.sh \
%D%/deadman-slot_off.sh \
%D%/deadman-sync-slot_off.sh \
%D%/generic-notify.sh \
%D%/pool_import-led.sh \
%D%/pool_import-sync-led.sh \
%D%/resilver_finish-notify.sh \
%D%/resilver_finish-start-scrub.sh \
%D%/scrub_finish-notify.sh \
%D%/statechange-led.sh \
%D%/statechange-sync-led.sh \
%D%/statechange-notify.sh \
%D%/statechange-slot_off.sh \
%D%/statechange-sync-slot_off.sh \
%D%/trim_finish-notify.sh \
%D%/vdev_attach-led.sh \
%D%/vdev_clear-led.sh
%D%/vdev_attach-sync-led.sh \
%D%/vdev_clear-sync-led.sh
nodist_zedexec_SCRIPTS = \
%D%/history_event-zfs-list-cacher.sh
@ -30,17 +30,17 @@ SUBSTFILES += $(nodist_zedexec_SCRIPTS)
zedconfdefaults = \
all-syslog.sh \
data-notify.sh \
deadman-slot_off.sh \
deadman-sync-slot_off.sh \
history_event-zfs-list-cacher.sh \
pool_import-led.sh \
pool_import-sync-led.sh \
resilver_finish-notify.sh \
resilver_finish-start-scrub.sh \
scrub_finish-notify.sh \
statechange-led.sh \
statechange-sync-led.sh \
statechange-notify.sh \
statechange-slot_off.sh \
vdev_attach-led.sh \
vdev_clear-led.sh
statechange-sync-slot_off.sh \
vdev_attach-sync-led.sh \
vdev_clear-sync-led.sh
dist_noinst_DATA += %D%/README

View File

@ -1 +0,0 @@
statechange-led.sh

View File

@ -0,0 +1 @@
statechange-sync-led.sh

View File

@ -1 +0,0 @@
statechange-led.sh

View File

@ -0,0 +1 @@
statechange-sync-led.sh

View File

@ -1 +0,0 @@
statechange-led.sh

View File

@ -0,0 +1 @@
statechange-sync-led.sh

View File

@ -196,37 +196,29 @@ _nop(int sig)
(void) sig;
}
static void *
_reap_children(void *arg)
static void
wait_for_children(boolean_t do_pause, boolean_t wait)
{
(void) arg;
struct launched_process_node node, *pnode;
pid_t pid;
int status;
struct rusage usage;
struct sigaction sa = {};
(void) sigfillset(&sa.sa_mask);
(void) sigdelset(&sa.sa_mask, SIGCHLD);
(void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL);
(void) sigemptyset(&sa.sa_mask);
sa.sa_handler = _nop;
sa.sa_flags = SA_NOCLDSTOP;
(void) sigaction(SIGCHLD, &sa, NULL);
int status;
struct launched_process_node node, *pnode;
for (_reap_children_stop = B_FALSE; !_reap_children_stop; ) {
(void) pthread_mutex_lock(&_launched_processes_lock);
pid = wait4(0, &status, WNOHANG, &usage);
pid = wait4(0, &status, wait ? 0 : WNOHANG, &usage);
if (pid == 0 || pid == (pid_t)-1) {
(void) pthread_mutex_unlock(&_launched_processes_lock);
if (pid == 0 || errno == ECHILD)
pause();
else if (errno != EINTR)
if ((pid == 0) || (errno == ECHILD)) {
if (do_pause)
pause();
} else if (errno != EINTR)
zed_log_msg(LOG_WARNING,
"Failed to wait for children: %s",
strerror(errno));
if (!do_pause)
return;
} else {
memset(&node, 0, sizeof (node));
node.pid = pid;
@ -278,6 +270,25 @@ _reap_children(void *arg)
}
}
}
static void *
_reap_children(void *arg)
{
(void) arg;
struct sigaction sa = {};
(void) sigfillset(&sa.sa_mask);
(void) sigdelset(&sa.sa_mask, SIGCHLD);
(void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL);
(void) sigemptyset(&sa.sa_mask);
sa.sa_handler = _nop;
sa.sa_flags = SA_NOCLDSTOP;
(void) sigaction(SIGCHLD, &sa, NULL);
wait_for_children(B_TRUE, B_FALSE);
return (NULL);
}
@ -306,6 +317,45 @@ zed_exec_fini(void)
_reap_children_tid = (pthread_t)-1;
}
/*
 * Report whether a zedlet's file name marks it as a synchronous zedlet for
 * the given event.
 *
 * A zedlet is synchronous when the literal "-sync-" appears at offset
 * strlen(event) in its file name and at least one more character follows it:
 *
 *	EVENT_NAME-sync-ZEDLETNAME.sh
 *
 * For example, a synchronous statechange zedlet could be named:
 *
 *	statechange-sync-myzedlet.sh
 *
 * This function does not itself verify that [zedlet] begins with [event];
 * it only inspects the characters that follow the first strlen(event) bytes.
 *
 * A synchronous zedlet is guaranteed to be the only zedlet running: all
 * previously spawned zedlets are waited for before it starts, and no other
 * zedlet may run in parallel with it.  Because this reduces parallelism,
 * synchronous zedlets should only be used when ordering really matters.
 *
 * Returns B_TRUE if [zedlet] is synchronous, B_FALSE otherwise.
 */
static boolean_t
zedlet_is_sync(const char *zedlet, const char *event)
{
	static const char marker[] = "-sync-";
	const size_t marker_len = sizeof (marker) - 1;
	const size_t prefix_len = strlen(event);

	/* Need room for the event, the marker, and at least one more byte. */
	if (strlen(zedlet) <= prefix_len + marker_len)
		return (B_FALSE);

	return ((strncmp(zedlet + prefix_len, marker, marker_len) == 0) ?
	    B_TRUE : B_FALSE);
}
/*
* Process the event [eid] by synchronously invoking all zedlets with a
* matching class prefix.
@ -368,9 +418,28 @@ zed_exec_process(uint64_t eid, const char *class, const char *subclass,
z = zed_strings_next(zcp->zedlets)) {
for (csp = class_strings; *csp; csp++) {
n = strlen(*csp);
if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n]))
if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) {
boolean_t is_sync = zedlet_is_sync(z, *csp);
if (is_sync) {
/*
* Wait for previous zedlets to
* finish
*/
wait_for_children(B_FALSE, B_TRUE);
}
_zed_exec_fork_child(eid, zcp->zedlet_dir,
z, e, zcp->zevent_fd, zcp->do_foreground);
if (is_sync) {
/*
* Wait for sync zedlet we just launched
* to finish.
*/
wait_for_children(B_FALSE, B_TRUE);
}
}
}
}
free(e);

View File

@ -158,6 +158,8 @@ Multiple ZEDLETs may be invoked for a given zevent.
ZEDLETs are executables invoked by the ZED in response to a given zevent.
They should be written under the presumption they can be invoked concurrently,
and they should use appropriate locking to access any shared resources.
The one exception to this is "synchronous zedlets", which are described later
in this page.
Common variables used by ZEDLETs can be stored in the default rc file which
is sourced by scripts; these variables should be prefixed with
.Sy ZED_ .
@ -233,6 +235,36 @@ and
.Sy ZPOOL .
These variables may be overridden in the rc file.
.
.Sh Synchronous ZEDLETS
ZED's normal behavior is to spawn off zedlets in parallel and ignore their
completion order.
This means that ZED can potentially
have zedlets for event ID number 2 starting before zedlets for event ID number
1 have finished.
Most of the time this is fine, and it actually helps when the system is getting
hammered with hundreds of events.
.Pp
However, there are times when you want your zedlets to be executed in sequence
with the event ID.
That is where synchronous zedlets come in.
.Pp
ZED will wait for all previously spawned zedlets to finish before running
a synchronous zedlet.
Synchronous zedlets are guaranteed to be the only
zedlet running.
No other zedlets may run in parallel with a synchronous zedlet.
Users should be careful to only use synchronous zedlets when needed, since
they decrease parallelism.
.Pp
To make a zedlet synchronous, simply add a "-sync-" immediately following the
event name in the zedlet's file name:
.Pp
.Sy EVENT_NAME-sync-ZEDLETNAME.sh
.Pp
For example, if you wanted a synchronous statechange script:
.Pp
.Sy statechange-sync-myzedlet.sh
.
.Sh FILES
.Bl -tag -width "-c"
.It Pa @sysconfdir@/zfs/zed.d

View File

@ -110,7 +110,8 @@ tags = ['functional', 'direct']
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple',
'slow_vdev_sit_out', 'slow_vdev_sit_out_neg', 'slow_vdev_degraded_sit_out']
'zed_synchronous_zedlet', 'slow_vdev_sit_out', 'slow_vdev_sit_out_neg',
'slow_vdev_degraded_sit_out']
tags = ['functional', 'events']
[tests/functional/fallocate:Linux]

View File

@ -1536,6 +1536,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/events/zed_rc_filter.ksh \
functional/events/zed_slow_io.ksh \
functional/events/zed_slow_io_many_vdevs.ksh \
functional/events/zed_synchronous_zedlet.ksh \
functional/exec/cleanup.ksh \
functional/exec/exec_001_pos.ksh \
functional/exec/exec_002_neg.ksh \

View File

@ -0,0 +1,149 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2025 by Lawrence Livermore National Security, LLC.
#
# DESCRIPTION:
# Verify ZED synchronous zedlets work as expected
#
# STRATEGY:
# 1. Create a scrub_start zedlet that runs quickly
# 2. Create a scrub_start zedlet that runs slowly (takes seconds)
# 3. Create a scrub_finish zedlet that is synchronous and runs slowly
# 4. Create a trim_start zedlet that runs quickly
# 5. Scrub the pool
# 6. Trim the pool
# 7. Verify the synchronous scrub_finish zedlet waited for the scrub_start
# zedlets to finish (including the slow one). If the scrub_finish zedlet
# was not synchronous, it would have completed before the slow scrub_start
# zedlet.
# 8. Verify the trim_start zedlet waited for the slow synchronous scrub_finish
# zedlet to complete.
# Source the common test library and the shared helpers for the events tests
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/events/events_common.kshlib
verify_runnable "both"
# Zedlets this test installs into $ZEDLET_DIR (and removes again on cleanup)
OUR_ZEDLETS="scrub_start-async.sh scrub_start-slow.sh scrub_finish-sync-slow.sh trim_start-async.sh"
# Each test zedlet appends a "<date> <script name>" line to this file
OUTFILE="$TEST_BASE_DIR/zed_synchronous_zedlet_lines"
# Name of the file-backed pool created for this test
TESTPOOL2=testpool2
# Undo everything this test set up: stop ZED, remove the zedlets listed in
# $OUR_ZEDLETS from $ZEDLET_DIR, destroy $TESTPOOL2, and delete the pool's
# backing file and the zedlet output file.  Registered with log_onexit.
function cleanup
{
zed_stop
for i in $OUR_ZEDLETS ; do
log_must rm -f $ZEDLET_DIR/$i
done
destroy_pool $TESTPOOL2
log_must rm -f $TEST_BASE_DIR/vdev-file-sync-zedlet
log_must rm -f $OUTFILE
}
log_assert "Verify ZED synchronous zedlets work as expected"
log_onexit cleanup
# Make a pool backed by a 100M file vdev
log_must truncate -s 100M $TEST_BASE_DIR/vdev-file-sync-zedlet
log_must zpool create $TESTPOOL2 $TEST_BASE_DIR/vdev-file-sync-zedlet
# Do an initial scrub, then clear the recorded events so that only events
# generated after this point are pending when ZED is started below.
log_must zpool scrub -w $TESTPOOL2
log_must zpool events -c
mkdir -p $ZEDLET_DIR
# Create zedlets.  Each one appends "<date> <script name>" to $OUTFILE; the
# "slow" ones sleep for 3 seconds first.  $OUTFILE is expanded now, while the
# escaped \$(...) and \$0 are left for the zedlet to expand when it runs.
cat << EOF > $ZEDLET_DIR/scrub_start-async.sh
#!/bin/ksh -p
echo "\$(date) \$(basename \$0)" >> $OUTFILE
EOF
cat << EOF > $ZEDLET_DIR/scrub_start-slow.sh
#!/bin/ksh -p
sleep 3
echo "\$(date) \$(basename \$0)" >> $OUTFILE
EOF
cat << EOF > $ZEDLET_DIR/scrub_finish-sync-slow.sh
#!/bin/ksh -p
sleep 3
echo "\$(date) \$(basename \$0)" >> $OUTFILE
EOF
cat << EOF > $ZEDLET_DIR/trim_start-async.sh
#!/bin/ksh -p
echo "\$(date) \$(basename \$0)" >> $OUTFILE
EOF
# Make the zedlets executable
for i in $OUR_ZEDLETS ; do
log_must chmod +x $ZEDLET_DIR/$i
done
log_must zed_start
# Do a scrub - it should be instantaneous.
log_must zpool scrub -w $TESTPOOL2
# Start off a trim immediately after scrubbing. The trim should be
# instantaneous and generate a trim_start event. This will happen in parallel
# with the slow 'scrub_finish-sync-slow.sh' zedlet still running.
log_must zpool trim -w $TESTPOOL2
# Wait for the trim_finish event to happen for sanity. This is the *event*,
# not the completion of zedlets for the event.
log_must file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.trim_finish' 10
# At a minimum, scrub_start-slow.sh + scrub_finish-sync-slow.sh will take a
# total of 6 seconds to run, so wait 7 sec to be sure.
sleep 7
# If our zedlets were run in the right order, with sync correctly honored, you
# will see this ordering in $OUTFILE:
#
# Fri May 16 12:04:23 PDT 2025 scrub_start-async.sh
# Fri May 16 12:04:26 PDT 2025 scrub_start-slow.sh
# Fri May 16 12:04:31 PDT 2025 scrub_finish-sync-slow.sh
# Fri May 16 12:04:31 PDT 2025 trim_start-async.sh
#
# Check for this ordering
# Get a list of just the script names in the order they were executed
# from OUTFILE.  The unquoted $(grep ...) inside echo joins the matches,
# one per line, into a single space-separated string.
lines="$(echo $(grep -Eo '(scrub|trim)_.+\.sh$' $OUTFILE))"
# Compare it to the ordering we expect (the backslash-newlines keep this a
# single space-separated line)
expected="\
scrub_start-async.sh \
scrub_start-slow.sh \
scrub_finish-sync-slow.sh \
trim_start-async.sh"
log_must test "$lines" == "$expected"
log_pass "Verified synchronous zedlets"