mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Fix snapshot automount expiry cancellation deadlock
A deadlock occurs when snapshot expiry tasks are cancelled while holding locks. The snapshot expiry task (snapentry_expire) spawns an umount process and waits for it to complete. Concurrently, ARC memory pressure triggers arc_prune, which calls zfs_exit_fs(), attempting to cancel the expiry task while holding locks. The umount process spawned by the expiry task blocks trying to acquire locks held by arc_prune, which is blocked waiting for the expiry task to complete. This creates a circular dependency: expiry task waits for umount, umount waits for arc_prune, arc_prune waits for expiry task.

Fix by adding non-blocking cancellation support to taskq_cancel_id(). The zfs_exit_fs() path calls zfsctl_snapshot_unmount_delay() to reschedule the unmount, which needs to cancel any existing expiry task. It now uses non-blocking cancellation to avoid waiting while holding locks, breaking the deadlock by returning immediately when the task is already running.

The per-entry se_taskqid_lock has been removed, with all taskqid operations now protected by the global zfs_snapshot_lock held as WRITER. Additionally, an se_in_umount flag prevents recursive waits when zfsctl_destroy() is called during unmount. The taskqid is now only cleared by the caller on successful cancellation; running tasks clear their own taskqid upon completion.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #17941
This commit is contained in:
committed by
Brian Behlendorf
parent
663dc86de2
commit
74bbdda1ef
@@ -351,7 +351,7 @@ taskq_free(taskq_ent_t *task)
|
||||
}
|
||||
|
||||
int
|
||||
taskq_cancel_id(taskq_t *tq, taskqid_t tid)
|
||||
taskq_cancel_id(taskq_t *tq, taskqid_t tid, boolean_t wait)
|
||||
{
|
||||
uint32_t pend;
|
||||
int rc;
|
||||
@@ -362,12 +362,12 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
|
||||
|
||||
if (ent->tqent_type == NORMAL_TASK) {
|
||||
rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
|
||||
if (rc == EBUSY)
|
||||
if (rc == EBUSY && wait)
|
||||
taskqueue_drain(tq->tq_queue, &ent->tqent_task);
|
||||
} else {
|
||||
rc = taskqueue_cancel_timeout(tq->tq_queue,
|
||||
&ent->tqent_timeout_task, &pend);
|
||||
if (rc == EBUSY) {
|
||||
if (rc == EBUSY && wait) {
|
||||
taskqueue_drain_timeout(tq->tq_queue,
|
||||
&ent->tqent_timeout_task);
|
||||
}
|
||||
@@ -381,6 +381,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
|
||||
}
|
||||
/* Free the extra reference we added with taskq_lookup. */
|
||||
taskq_free(ent);
|
||||
|
||||
/*
|
||||
* If task was running and we didn't wait, return EBUSY.
|
||||
* Otherwise return 0 if cancelled or ENOENT if not found.
|
||||
*/
|
||||
if (rc == EBUSY && !wait)
|
||||
return (EBUSY);
|
||||
return (pend ? 0 : ENOENT);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user