mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 10:37:35 +03:00
OpenZFS 9425 - channel programs can be interrupted
Problem Statement ================= ZFS Channel program scripts currently require a timeout, so that hung or long-running scripts return a timeout error instead of causing ZFS to get wedged. This limit can currently be set up to 100 million Lua instructions. Even with a limit in place, it would be desirable to have a sys admin (support engineer) be able to cancel a script that is taking a long time. Proposed Solution ================= Make it possible to abort a channel program by sending an interrupt signal. In the underlying txg_wait_sync function, switch the cv_wait to a cv_wait_sig to catch the signal. Once a signal is encountered, the dsl_sync_task function can install a Lua hook that will get called before the Lua interpreter executes a new line of code. The dsl_sync_task can resume with a standard txg_wait_sync call and wait for the txg to complete. Meanwhile, the hook will abort the script and indicate that the channel program was canceled. The kernel returns an EINTR to indicate that the channel program run was canceled. Porting notes: Added missing return value from cv_wait_sig() Authored by: Don Brady <don.brady@delphix.com> Reviewed by: Sebastien Roy <sebastien.roy@delphix.com> Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com> Reviewed by: Matt Ahrens <matt@delphix.com> Reviewed by: Sara Hartse <sara.hartse@delphix.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Robert Mustacchi <rm@joyent.com> Ported-by: Don Brady <don.brady@delphix.com> Signed-off-by: Don Brady <don.brady@delphix.com> OpenZFS-issue: https://www.illumos.org/issues/9425 OpenZFS-commit: https://github.com/illumos/illumos-gate/commit/d0cb1fb926 Closes #8904
This commit is contained in:
@@ -87,7 +87,7 @@ tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit',
|
||||
'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict',
|
||||
'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult',
|
||||
'tst.rollback_one', 'tst.snapshot_destroy', 'tst.snapshot_neg',
|
||||
'tst.snapshot_recursive', 'tst.snapshot_simple']
|
||||
'tst.snapshot_recursive', 'tst.snapshot_simple', 'tst.terminate_by_signal']
|
||||
tags = ['functional', 'channel_program', 'synctask_core']
|
||||
|
||||
[tests/functional/chattr]
|
||||
|
||||
@@ -27,7 +27,8 @@ dist_pkgdata_SCRIPTS = \
|
||||
tst.snapshot_destroy.ksh \
|
||||
tst.snapshot_neg.ksh \
|
||||
tst.snapshot_recursive.ksh \
|
||||
tst.snapshot_simple.ksh
|
||||
tst.snapshot_simple.ksh \
|
||||
tst.terminate_by_signal.ksh
|
||||
|
||||
dist_pkgdata_DATA = \
|
||||
tst.get_index_props.out \
|
||||
|
||||
Executable
+98
@@ -0,0 +1,98 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2017 by Delphix. All rights reserved.
|
||||
#
|
||||
. $STF_SUITE/tests/functional/channel_program/channel_common.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION: Execute a long-running zfs channel program and attempt to
|
||||
# cancel it by sending a signal.
|
||||
#
|
||||
|
||||
verify_runnable "global"

# Value handed to 'zfs program -t' when the channel program is started.
limit=50000000

# Short name given to every snapshot the channel program takes.
snapname=snap

# Parent file system; the test creates its child file systems under it.
rootfs=$TESTPOOL/$TESTFS
|
||||
|
||||
function cleanup
{
	#
	# Nothing to tear down unless $rootfs exists.  The bare 'return'
	# hands back the status of datasetexists, so the function's exit
	# status is the same as the and-list form would produce.
	#
	datasetexists $rootfs || return

	# Recursively destroy the test file system (zfs destroy -R).
	log_must zfs destroy -R $rootfs
}
|
||||
|
||||
log_onexit cleanup

#
# Populate $rootfs with the working set: child1 through child100.
#
typeset -i i=1
while (( i <= 100 )); do
	log_must zfs create "$rootfs/child$i"
	(( i += 1 ))
done
|
||||
|
||||
#
|
||||
# Attempt to create 100 snapshots with zfs.sync.snapshot() along with some
|
||||
# time consuming efforts. We use loops of zfs.check.* (dry run operations)
|
||||
# to consume instructions before the next zfs.sync.snapshot() occurs.
|
||||
#
|
||||
# Without a signal interruption this ZCP would take several minutes and
|
||||
# generate over 30 million Lua instructions.
|
||||
#
|
||||
function chan_prog
{
	#
	# Run the Lua program below against $TESTPOOL, passing $limit to
	# 'zfs program -t'.  $rootfs and $snapname follow the script argument
	# and are read by the program as argv[1] and argv[2].
	#
	# For every child of argv[1] the program takes one snapshot and then
	# loops 20000 times over zfs.check.* calls before moving on to the
	# next child.
	#
	# The here-document delimiter is quoted so the shell hands the Lua
	# source to zfs verbatim (no parameter, command or backslash
	# processing), and every expansion is quoted so an argument can never
	# be word-split or globbed.
	#
	zfs program -t "$limit" "$TESTPOOL" - "$rootfs" "$snapname" <<-'EOF'
		arg = ...
		fs = arg["argv"][1]
		snap = arg["argv"][2]
		for child in zfs.list.children(fs) do
			local snapname = child .. "@" .. snap
			zfs.check.snapshot(snapname)
			zfs.sync.snapshot(snapname)
			for i=1,20000,1 do
				zfs.check.snapshot(snapname)
				zfs.check.destroy(snapname)
				zfs.check.destroy(fs)
			end
		end
		return "should not have reached here"
	EOF
}
|
||||
|
||||
log_note "Executing a long-running zfs program in the background"
chan_prog &
CHILD=$!

#
# After waiting, send a kill signal to the channel program process.
# This should stop the ZCP near a million instructions but still have
# created some of the snapshots.  $CHILD is the background job running
# chan_prog; since the zfs program command might get wrapped and run as
# a child of that job, first signal the job's children (pkill -P) and
# then the job itself.
#
sleep 10
log_pos pkill -P $CHILD
log_pos kill $CHILD

#
# Make sure the channel program did not fully complete by enforcing
# that not all of the snapshots were created.  grep -c prints a bare
# match count, so no separate wc process is needed.
#
snap_count=$(zfs list -t snapshot | grep -c "$TESTPOOL")
log_note "$snap_count snapshots created by ZCP"

if [ "$snap_count" -eq 0 ]; then
	# No snapshot at all means the program never got going.
	log_fail "Channel program failed to run."
elif [ "$snap_count" -gt 50 ]; then
	# More than half of the 100 children were snapshotted.
	log_fail "Too many snapshots after a cancel ($snap_count)."
else
	log_pass "Canceling a long-running channel program works."
fi
|
||||
Reference in New Issue
Block a user