Fix snapshot automount expiry cancellation deadlock

A deadlock occurs when snapshot expiry tasks are cancelled while holding
locks. The snapshot expiry task (snapentry_expire) spawns an umount
process and waits for it to complete. Concurrently, ARC memory pressure
triggers arc_prune which calls zfs_exit_fs(), attempting to cancel the
expiry task while holding locks. The umount process spawned by the
expiry task blocks while trying to acquire locks held by the arc_prune
path, which in turn is blocked waiting for the expiry task to complete.
The result is a circular dependency: the expiry task waits for umount,
umount waits for arc_prune, and arc_prune waits for the expiry task.

Fix by adding non-blocking cancellation support to taskq_cancel_id().
The zfs_exit_fs() path calls zfsctl_snapshot_unmount_delay() to
reschedule the unmount, which needs to cancel any existing expiry task.
It now uses non-blocking cancellation to avoid waiting while holding
locks, breaking the deadlock by returning immediately when the task is
already running.

The per-entry se_taskqid_lock has been removed, with all taskqid
operations now protected by the global zfs_snapshot_lock held as
WRITER. Additionally, an se_in_umount flag prevents recursive waits when
zfsctl_destroy() is called during unmount. The taskqid is now only
cleared by the caller on successful cancellation; running tasks clear
their own taskqid upon completion.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #17941
This commit is contained in:
Ameer Hamza 2025-12-02 03:43:42 +05:00 committed by Brian Behlendorf
parent 663dc86de2
commit 74bbdda1ef
12 changed files with 69 additions and 48 deletions

View File

@ -107,7 +107,7 @@ extern void taskq_destroy(taskq_t *);
extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_id(taskq_t *, taskqid_t);
extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
extern void taskq_wait(taskq_t *); extern void taskq_wait(taskq_t *);
extern int taskq_cancel_id(taskq_t *, taskqid_t); extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
extern int taskq_member(taskq_t *, kthread_t *); extern int taskq_member(taskq_t *, kthread_t *);
extern taskq_t *taskq_of_curthread(void); extern taskq_t *taskq_of_curthread(void);
void taskq_suspend(taskq_t *); void taskq_suspend(taskq_t *);

View File

@ -198,7 +198,7 @@ extern void taskq_destroy(taskq_t *);
extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_id(taskq_t *, taskqid_t);
extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
extern void taskq_wait(taskq_t *); extern void taskq_wait(taskq_t *);
extern int taskq_cancel_id(taskq_t *, taskqid_t); extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
extern int taskq_member(taskq_t *, kthread_t *); extern int taskq_member(taskq_t *, kthread_t *);
extern taskq_t *taskq_of_curthread(void); extern taskq_t *taskq_of_curthread(void);

View File

@ -509,7 +509,7 @@ extern void taskq_wait_id(taskq_t *, taskqid_t);
extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
extern int taskq_member(taskq_t *, kthread_t *); extern int taskq_member(taskq_t *, kthread_t *);
extern taskq_t *taskq_of_curthread(void); extern taskq_t *taskq_of_curthread(void);
extern int taskq_cancel_id(taskq_t *, taskqid_t); extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
extern void system_taskq_init(void); extern void system_taskq_init(void);
extern void system_taskq_fini(void); extern void system_taskq_fini(void);

View File

@ -390,9 +390,9 @@ taskq_of_curthread(void)
} }
int int
taskq_cancel_id(taskq_t *tq, taskqid_t id) taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait)
{ {
(void) tq, (void) id; (void) tq, (void) id, (void) wait;
return (ENOENT); return (ENOENT);
} }

View File

@ -351,7 +351,7 @@ taskq_free(taskq_ent_t *task)
} }
int int
taskq_cancel_id(taskq_t *tq, taskqid_t tid) taskq_cancel_id(taskq_t *tq, taskqid_t tid, boolean_t wait)
{ {
uint32_t pend; uint32_t pend;
int rc; int rc;
@ -362,12 +362,12 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
if (ent->tqent_type == NORMAL_TASK) { if (ent->tqent_type == NORMAL_TASK) {
rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
if (rc == EBUSY) if (rc == EBUSY && wait)
taskqueue_drain(tq->tq_queue, &ent->tqent_task); taskqueue_drain(tq->tq_queue, &ent->tqent_task);
} else { } else {
rc = taskqueue_cancel_timeout(tq->tq_queue, rc = taskqueue_cancel_timeout(tq->tq_queue,
&ent->tqent_timeout_task, &pend); &ent->tqent_timeout_task, &pend);
if (rc == EBUSY) { if (rc == EBUSY && wait) {
taskqueue_drain_timeout(tq->tq_queue, taskqueue_drain_timeout(tq->tq_queue,
&ent->tqent_timeout_task); &ent->tqent_timeout_task);
} }
@ -381,6 +381,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
} }
/* Free the extra reference we added with taskq_lookup. */ /* Free the extra reference we added with taskq_lookup. */
taskq_free(ent); taskq_free(ent);
/*
* If task was running and we didn't wait, return EBUSY.
* Otherwise return 0 if cancelled or ENOENT if not found.
*/
if (rc == EBUSY && !wait)
return (EBUSY);
return (pend ? 0 : ENOENT); return (pend ? 0 : ENOENT);
} }

View File

@ -840,7 +840,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
id = skc->skc_taskqid; id = skc->skc_taskqid;
spin_unlock(&skc->skc_lock); spin_unlock(&skc->skc_lock);
taskq_cancel_id(spl_kmem_cache_taskq, id); taskq_cancel_id(spl_kmem_cache_taskq, id, B_TRUE);
/* /*
* Wait until all current callers complete, this is mainly * Wait until all current callers complete, this is mainly

View File

@ -600,13 +600,22 @@ taskq_of_curthread(void)
EXPORT_SYMBOL(taskq_of_curthread); EXPORT_SYMBOL(taskq_of_curthread);
/* /*
* Cancel an already dispatched task given the task id. Still pending tasks * Cancel a dispatched task. Pending tasks are cancelled immediately.
* will be immediately canceled, and if the task is active the function will * If the task is running, behavior depends on wait parameter:
* block until it completes. Preallocated tasks which are canceled must be * - wait=B_TRUE: Block until task completes
* freed by the caller. * - wait=B_FALSE: Return EBUSY immediately
*
* Return values:
* 0 - Cancelled before execution. Caller must release resources.
* EBUSY - Task running (wait=B_FALSE only). Will self-cleanup.
* ENOENT - Not found, or completed after waiting. Already cleaned up.
*
* Note: wait=B_TRUE returns ENOENT (not EBUSY) after waiting because
* the task no longer exists. This distinguishes "cancelled before run"
* from "completed naturally" for proper resource management.
*/ */
int int
taskq_cancel_id(taskq_t *tq, taskqid_t id) taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait)
{ {
taskq_ent_t *t; taskq_ent_t *t;
int rc = ENOENT; int rc = ENOENT;
@ -669,8 +678,12 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
spin_unlock_irqrestore(&tq->tq_lock, flags); spin_unlock_irqrestore(&tq->tq_lock, flags);
if (t == ERR_PTR(-EBUSY)) { if (t == ERR_PTR(-EBUSY)) {
taskq_wait_id(tq, id); if (wait) {
rc = EBUSY; taskq_wait_id(tq, id);
rc = ENOENT; /* Completed, no longer exists */
} else {
rc = EBUSY; /* Still running */
}
} }
return (rc); return (rc);

View File

@ -120,7 +120,6 @@ typedef struct {
spa_t *se_spa; /* pool spa */ spa_t *se_spa; /* pool spa */
uint64_t se_objsetid; /* snapshot objset id */ uint64_t se_objsetid; /* snapshot objset id */
struct dentry *se_root_dentry; /* snapshot root dentry */ struct dentry *se_root_dentry; /* snapshot root dentry */
krwlock_t se_taskqid_lock; /* scheduled unmount taskqid lock */
taskqid_t se_taskqid; /* scheduled unmount taskqid */ taskqid_t se_taskqid; /* scheduled unmount taskqid */
avl_node_t se_node_name; /* zfs_snapshots_by_name link */ avl_node_t se_node_name; /* zfs_snapshots_by_name link */
avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
@ -147,7 +146,6 @@ zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
se->se_objsetid = objsetid; se->se_objsetid = objsetid;
se->se_root_dentry = root_dentry; se->se_root_dentry = root_dentry;
se->se_taskqid = TASKQID_INVALID; se->se_taskqid = TASKQID_INVALID;
rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL);
zfs_refcount_create(&se->se_refcount); zfs_refcount_create(&se->se_refcount);
@ -164,7 +162,6 @@ zfsctl_snapshot_free(zfs_snapentry_t *se)
zfs_refcount_destroy(&se->se_refcount); zfs_refcount_destroy(&se->se_refcount);
kmem_strfree(se->se_name); kmem_strfree(se->se_name);
kmem_strfree(se->se_path); kmem_strfree(se->se_path);
rw_destroy(&se->se_taskqid_lock);
kmem_free(se, sizeof (zfs_snapentry_t)); kmem_free(se, sizeof (zfs_snapentry_t));
} }
@ -340,17 +337,15 @@ snapentry_expire(void *data)
return; return;
} }
rw_enter(&se->se_taskqid_lock, RW_WRITER);
se->se_taskqid = TASKQID_INVALID;
rw_exit(&se->se_taskqid_lock);
(void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
zfsctl_snapshot_rele(se);
/* /*
* Reschedule the unmount if the zfs_snapentry_t wasn't removed. * Clear taskqid and reschedule if the snapshot wasn't removed.
* This can occur when the snapshot is busy. * This can occur when the snapshot is busy.
*/ */
rw_enter(&zfs_snapshot_lock, RW_READER); rw_enter(&zfs_snapshot_lock, RW_WRITER);
se->se_taskqid = TASKQID_INVALID;
zfsctl_snapshot_rele(se);
if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
zfsctl_snapshot_rele(se); zfsctl_snapshot_rele(se);
@ -367,17 +362,17 @@ static void
zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
{ {
int err = 0; int err = 0;
rw_enter(&se->se_taskqid_lock, RW_WRITER);
err = taskq_cancel_id(system_delay_taskq, se->se_taskqid); ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
err = taskq_cancel_id(system_delay_taskq, se->se_taskqid, B_FALSE);
/* /*
* if we get ENOENT, the taskq couldn't be found to be * Clear taskqid only if we successfully cancelled before execution.
* canceled, so we can just mark it as invalid because * For ENOENT, task already cleared it. For EBUSY, task will clear
* it's already gone. If we got EBUSY, then we already * it when done.
* blocked until it was gone _anyway_, so we don't care.
*/ */
se->se_taskqid = TASKQID_INVALID;
rw_exit(&se->se_taskqid_lock);
if (err == 0) { if (err == 0) {
se->se_taskqid = TASKQID_INVALID;
zfsctl_snapshot_rele(se); zfsctl_snapshot_rele(se);
} }
} }
@ -388,12 +383,11 @@ zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
static void static void
zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
{ {
ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
if (delay <= 0) if (delay <= 0)
return; return;
zfsctl_snapshot_hold(se);
rw_enter(&se->se_taskqid_lock, RW_WRITER);
/* /*
* If this condition happens, we managed to: * If this condition happens, we managed to:
* - dispatch once * - dispatch once
@ -404,13 +398,12 @@ zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
* no problem. * no problem.
*/ */
if (se->se_taskqid != TASKQID_INVALID) { if (se->se_taskqid != TASKQID_INVALID) {
rw_exit(&se->se_taskqid_lock);
zfsctl_snapshot_rele(se);
return; return;
} }
zfsctl_snapshot_hold(se);
se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
rw_exit(&se->se_taskqid_lock);
} }
/* /*
@ -425,7 +418,7 @@ zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
zfs_snapentry_t *se; zfs_snapentry_t *se;
int error = ENOENT; int error = ENOENT;
rw_enter(&zfs_snapshot_lock, RW_READER); rw_enter(&zfs_snapshot_lock, RW_WRITER);
if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_unmount_cancel(se);
zfsctl_snapshot_unmount_delay_impl(se, delay); zfsctl_snapshot_unmount_delay_impl(se, delay);
@ -614,13 +607,18 @@ zfsctl_destroy(zfsvfs_t *zfsvfs)
rw_enter(&zfs_snapshot_lock, RW_WRITER); rw_enter(&zfs_snapshot_lock, RW_WRITER);
se = zfsctl_snapshot_find_by_objsetid(spa, objsetid); se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
if (se != NULL)
zfsctl_snapshot_remove(se);
rw_exit(&zfs_snapshot_lock);
if (se != NULL) { if (se != NULL) {
zfsctl_snapshot_remove(se);
/*
* Don't wait if snapentry_expire task is calling
* umount, which may have resulted in this destroy
* call. Waiting would deadlock: snapentry_expire
* waits for umount while umount waits for task.
*/
zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_unmount_cancel(se);
zfsctl_snapshot_rele(se); zfsctl_snapshot_rele(se);
} }
rw_exit(&zfs_snapshot_lock);
} else if (zfsvfs->z_ctldir) { } else if (zfsvfs->z_ctldir) {
iput(zfsvfs->z_ctldir); iput(zfsvfs->z_ctldir);
zfsvfs->z_ctldir = NULL; zfsvfs->z_ctldir = NULL;

View File

@ -573,7 +573,8 @@ zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
if (zfsvfs->z_draining) { if (zfsvfs->z_draining) {
zfsvfs->z_drain_cancel = B_TRUE; zfsvfs->z_drain_cancel = B_TRUE;
taskq_cancel_id(dsl_pool_unlinked_drain_taskq( taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task); dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task,
B_TRUE);
zfsvfs->z_drain_task = TASKQID_INVALID; zfsvfs->z_drain_task = TASKQID_INVALID;
zfsvfs->z_draining = B_FALSE; zfsvfs->z_draining = B_FALSE;
} }

View File

@ -1452,7 +1452,8 @@ dmu_objset_upgrade_stop(objset_t *os)
os->os_upgrade_id = 0; os->os_upgrade_id = 0;
mutex_exit(&os->os_upgrade_lock); mutex_exit(&os->os_upgrade_lock);
if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) { if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id,
B_TRUE)) == 0) {
dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
} }
txg_wait_synced(os->os_spa->spa_dsl_pool, 0); txg_wait_synced(os->os_spa->spa_dsl_pool, 0);

View File

@ -1934,7 +1934,7 @@ spa_deactivate(spa_t *spa)
list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_evicting_os_list);
list_destroy(&spa->spa_state_dirty_list); list_destroy(&spa->spa_state_dirty_list);
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
for (int t = 0; t < ZIO_TYPES; t++) { for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
@ -10451,7 +10451,7 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_sync_starttime = gethrtime(); spa->spa_sync_starttime = gethrtime();
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
NSEC_TO_TICK(spa->spa_deadman_synctime)); NSEC_TO_TICK(spa->spa_deadman_synctime));
@ -10508,7 +10508,7 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_sync_rewrite_vdev_config(spa, tx); spa_sync_rewrite_vdev_config(spa, tx);
dmu_tx_commit(tx); dmu_tx_commit(tx);
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
spa->spa_deadman_tqid = 0; spa->spa_deadman_tqid = 0;
/* /*

View File

@ -1531,7 +1531,8 @@ zfs_ereport_taskq_fini(void)
{ {
mutex_enter(&recent_events_lock); mutex_enter(&recent_events_lock);
if (recent_events_cleaner_tqid != 0) { if (recent_events_cleaner_tqid != 0) {
taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid,
B_TRUE);
recent_events_cleaner_tqid = 0; recent_events_cleaner_tqid = 0;
} }
mutex_exit(&recent_events_lock); mutex_exit(&recent_events_lock);