mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Fix snapshot automount expiry cancellation deadlock
A deadlock occurs when snapshot expiry tasks are cancelled while holding locks. The snapshot expiry task (snapentry_expire) spawns an umount process and waits for it to complete. Concurrently, ARC memory pressure triggers arc_prune which calls zfs_exit_fs(), attempting to cancel the expiry task while holding locks. The umount process spawned by the expiry task blocks trying to acquire locks held by arc_prune, which is blocked waiting for the expiry task to complete. This creates a circular dependency: expiry task waits for umount, umount waits for arc_prune, arc_prune waits for expiry task.

Fix by adding non-blocking cancellation support to taskq_cancel_id(). The zfs_exit_fs() path calls zfsctl_snapshot_unmount_delay() to reschedule the unmount, which needs to cancel any existing expiry task. It now uses non-blocking cancellation to avoid waiting while holding locks, breaking the deadlock by returning immediately when the task is already running.

The per-entry se_taskqid_lock has been removed, with all taskqid operations now protected by the global zfs_snapshot_lock held as WRITER. Additionally, an se_in_umount flag prevents recursive waits when zfsctl_destroy() is called during unmount. The taskqid is now only cleared by the caller on successful cancellation; running tasks clear their own taskqid upon completion.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #17941
This commit is contained in:
@@ -1452,7 +1452,8 @@ dmu_objset_upgrade_stop(objset_t *os)
|
||||
os->os_upgrade_id = 0;
|
||||
mutex_exit(&os->os_upgrade_lock);
|
||||
|
||||
if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) {
|
||||
if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id,
|
||||
B_TRUE)) == 0) {
|
||||
dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
|
||||
}
|
||||
txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
|
||||
|
||||
+3
-3
@@ -1934,7 +1934,7 @@ spa_deactivate(spa_t *spa)
|
||||
list_destroy(&spa->spa_evicting_os_list);
|
||||
list_destroy(&spa->spa_state_dirty_list);
|
||||
|
||||
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
|
||||
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
|
||||
|
||||
for (int t = 0; t < ZIO_TYPES; t++) {
|
||||
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
|
||||
@@ -10451,7 +10451,7 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
|
||||
spa->spa_sync_starttime = gethrtime();
|
||||
|
||||
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
|
||||
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
|
||||
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
|
||||
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
|
||||
NSEC_TO_TICK(spa->spa_deadman_synctime));
|
||||
@@ -10508,7 +10508,7 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
spa_sync_rewrite_vdev_config(spa, tx);
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
|
||||
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
|
||||
spa->spa_deadman_tqid = 0;
|
||||
|
||||
/*
|
||||
|
||||
+2
-1
@@ -1531,7 +1531,8 @@ zfs_ereport_taskq_fini(void)
|
||||
{
|
||||
mutex_enter(&recent_events_lock);
|
||||
if (recent_events_cleaner_tqid != 0) {
|
||||
taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
|
||||
taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid,
|
||||
B_TRUE);
|
||||
recent_events_cleaner_tqid = 0;
|
||||
}
|
||||
mutex_exit(&recent_events_lock);
|
||||
|
||||
Reference in New Issue
Block a user