Fix snapshot automount expiry cancellation deadlock

A deadlock occurs when a snapshot expiry task is cancelled while locks
are held. The snapshot expiry task (snapentry_expire) spawns an umount
process and waits for it to complete. Concurrently, ARC memory pressure
triggers arc_prune, which calls zfs_exit_fs() and attempts to cancel
the expiry task while holding zfs_snapshot_lock. The umount process
spawned by the expiry task blocks trying to acquire that lock, while
arc_prune is blocked waiting for the expiry task to complete. This
creates a circular dependency: the expiry task waits for umount, umount
waits for arc_prune, and arc_prune waits for the expiry task.
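
In code terms, the pre-fix pattern looked roughly like this (a
simplified sketch of the old zfsctl_snapshot_unmount_delay() path shown
in the diff below; lookup and error handling omitted):

    rw_enter(&zfs_snapshot_lock, RW_READER);
    /* ... look up the zfs_snapentry_t "se" ... */

    /* Blocks until snapentry_expire() finishes. */
    (void) taskq_cancel_id(system_delay_taskq, se->se_taskqid);

    /*
     * Meanwhile snapentry_expire() is waiting on the umount process it
     * spawned, and that umount re-enters ZFS (zfsctl_destroy()) and
     * blocks on the zfs_snapshot_lock held above: a three-way cycle.
     */
    rw_exit(&zfs_snapshot_lock);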

Fix this by adding non-blocking cancellation support to
taskq_cancel_id(). The zfs_exit_fs() path calls
zfsctl_snapshot_unmount_delay() to reschedule the unmount, which must
first cancel any existing expiry task. It now uses non-blocking
cancellation, which returns immediately when the task is already
running instead of waiting while locks are held, breaking the deadlock.
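
A minimal sketch of the new calling convention, mirroring
zfsctl_snapshot_unmount_cancel() in the diff below:

    ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
    err = taskq_cancel_id(system_delay_taskq, se->se_taskqid, B_FALSE);
    if (err == 0) {
        /* Cancelled before it ran: drop the task's hold ourselves. */
        se->se_taskqid = TASKQID_INVALID;
        zfsctl_snapshot_rele(se);
    }
    /*
     * err == EBUSY: the task is running; it will clear se_taskqid and
     * drop its own hold, so do not wait for it here.
     * err == ENOENT: the task already completed and cleaned up.
     */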

The per-entry se_taskqid_lock has been removed; all taskqid operations
are now protected by the global zfs_snapshot_lock held as WRITER.
zfsctl_destroy() likewise cancels without waiting, because it can be
invoked from the very umount the expiry task spawned, and waiting there
would recurse into the same deadlock. The taskqid is now cleared only
by the caller on successful cancellation; a running task clears its own
taskqid upon completion.
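
The task side of that ownership rule, as snapentry_expire() now
implements it in the diff below (sketch; rescheduling elided):

    (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);

    rw_enter(&zfs_snapshot_lock, RW_WRITER);
    se->se_taskqid = TASKQID_INVALID;   /* running task clears its own id */
    zfsctl_snapshot_rele(se);           /* and drops the dispatch hold */
    /* ... reschedule the unmount if the snapshot entry still exists ... */
    rw_exit(&zfs_snapshot_lock);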

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #17941
Ameer Hamza 2025-12-02 03:43:42 +05:00 committed by Brian Behlendorf
parent 663dc86de2
commit 74bbdda1ef
12 changed files with 69 additions and 48 deletions

View File

@@ -107,7 +107,7 @@ extern void taskq_destroy(taskq_t *);
extern void taskq_wait_id(taskq_t *, taskqid_t);
extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
extern void taskq_wait(taskq_t *);
-extern int taskq_cancel_id(taskq_t *, taskqid_t);
+extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
extern int taskq_member(taskq_t *, kthread_t *);
extern taskq_t *taskq_of_curthread(void);
void taskq_suspend(taskq_t *);

View File

@@ -198,7 +198,7 @@ extern void taskq_destroy(taskq_t *);
extern void taskq_wait_id(taskq_t *, taskqid_t);
extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
extern void taskq_wait(taskq_t *);
-extern int taskq_cancel_id(taskq_t *, taskqid_t);
+extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
extern int taskq_member(taskq_t *, kthread_t *);
extern taskq_t *taskq_of_curthread(void);

View File

@@ -509,7 +509,7 @@ extern void taskq_wait_id(taskq_t *, taskqid_t);
extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
extern int taskq_member(taskq_t *, kthread_t *);
extern taskq_t *taskq_of_curthread(void);
-extern int taskq_cancel_id(taskq_t *, taskqid_t);
+extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
extern void system_taskq_init(void);
extern void system_taskq_fini(void);

View File

@@ -390,9 +390,9 @@ taskq_of_curthread(void)
}
int
-taskq_cancel_id(taskq_t *tq, taskqid_t id)
+taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait)
{
-(void) tq, (void) id;
+(void) tq, (void) id, (void) wait;
return (ENOENT);
}

View File

@@ -351,7 +351,7 @@ taskq_free(taskq_ent_t *task)
}
int
-taskq_cancel_id(taskq_t *tq, taskqid_t tid)
+taskq_cancel_id(taskq_t *tq, taskqid_t tid, boolean_t wait)
{
uint32_t pend;
int rc;
@@ -362,12 +362,12 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
if (ent->tqent_type == NORMAL_TASK) {
rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
-if (rc == EBUSY)
+if (rc == EBUSY && wait)
taskqueue_drain(tq->tq_queue, &ent->tqent_task);
} else {
rc = taskqueue_cancel_timeout(tq->tq_queue,
&ent->tqent_timeout_task, &pend);
-if (rc == EBUSY) {
+if (rc == EBUSY && wait) {
taskqueue_drain_timeout(tq->tq_queue,
&ent->tqent_timeout_task);
}
@@ -381,6 +381,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
}
/* Free the extra reference we added with taskq_lookup. */
taskq_free(ent);
+/*
+ * If task was running and we didn't wait, return EBUSY.
+ * Otherwise return 0 if cancelled or ENOENT if not found.
+ */
+if (rc == EBUSY && !wait)
+return (EBUSY);
return (pend ? 0 : ENOENT);
}

View File

@@ -840,7 +840,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
id = skc->skc_taskqid;
spin_unlock(&skc->skc_lock);
-taskq_cancel_id(spl_kmem_cache_taskq, id);
+taskq_cancel_id(spl_kmem_cache_taskq, id, B_TRUE);
/*
* Wait until all current callers complete, this is mainly

View File

@@ -600,13 +600,22 @@ taskq_of_curthread(void)
EXPORT_SYMBOL(taskq_of_curthread);
/*
-* Cancel an already dispatched task given the task id. Still pending tasks
-* will be immediately canceled, and if the task is active the function will
-* block until it completes. Preallocated tasks which are canceled must be
-* freed by the caller.
+* Cancel a dispatched task. Pending tasks are cancelled immediately.
+* If the task is running, behavior depends on wait parameter:
+* - wait=B_TRUE: Block until task completes
+* - wait=B_FALSE: Return EBUSY immediately
+*
+* Return values:
+* 0 - Cancelled before execution. Caller must release resources.
+* EBUSY - Task running (wait=B_FALSE only). Will self-cleanup.
+* ENOENT - Not found, or completed after waiting. Already cleaned up.
+*
+* Note: wait=B_TRUE returns ENOENT (not EBUSY) after waiting because
+* the task no longer exists. This distinguishes "cancelled before run"
+* from "completed naturally" for proper resource management.
*/
int
-taskq_cancel_id(taskq_t *tq, taskqid_t id)
+taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait)
{
taskq_ent_t *t;
int rc = ENOENT;
@@ -669,8 +678,12 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
spin_unlock_irqrestore(&tq->tq_lock, flags);
if (t == ERR_PTR(-EBUSY)) {
-taskq_wait_id(tq, id);
-rc = EBUSY;
+if (wait) {
+taskq_wait_id(tq, id);
+rc = ENOENT; /* Completed, no longer exists */
+} else {
+rc = EBUSY; /* Still running */
+}
}
return (rc);

View File

@@ -120,7 +120,6 @@ typedef struct {
spa_t *se_spa; /* pool spa */
uint64_t se_objsetid; /* snapshot objset id */
struct dentry *se_root_dentry; /* snapshot root dentry */
-krwlock_t se_taskqid_lock; /* scheduled unmount taskqid lock */
taskqid_t se_taskqid; /* scheduled unmount taskqid */
avl_node_t se_node_name; /* zfs_snapshots_by_name link */
avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
@@ -147,7 +146,6 @@ zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
se->se_objsetid = objsetid;
se->se_root_dentry = root_dentry;
se->se_taskqid = TASKQID_INVALID;
-rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL);
zfs_refcount_create(&se->se_refcount);
@@ -164,7 +162,6 @@ zfsctl_snapshot_free(zfs_snapentry_t *se)
zfs_refcount_destroy(&se->se_refcount);
kmem_strfree(se->se_name);
kmem_strfree(se->se_path);
-rw_destroy(&se->se_taskqid_lock);
kmem_free(se, sizeof (zfs_snapentry_t));
}
@@ -340,17 +337,15 @@ snapentry_expire(void *data)
return;
}
-rw_enter(&se->se_taskqid_lock, RW_WRITER);
-se->se_taskqid = TASKQID_INVALID;
-rw_exit(&se->se_taskqid_lock);
(void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
-zfsctl_snapshot_rele(se);
/*
-* Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+* Clear taskqid and reschedule if the snapshot wasn't removed.
* This can occur when the snapshot is busy.
*/
-rw_enter(&zfs_snapshot_lock, RW_READER);
+rw_enter(&zfs_snapshot_lock, RW_WRITER);
+se->se_taskqid = TASKQID_INVALID;
+zfsctl_snapshot_rele(se);
if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
zfsctl_snapshot_rele(se);
@@ -367,17 +362,17 @@ static void
zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
{
int err = 0;
-rw_enter(&se->se_taskqid_lock, RW_WRITER);
-err = taskq_cancel_id(system_delay_taskq, se->se_taskqid);
+ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+err = taskq_cancel_id(system_delay_taskq, se->se_taskqid, B_FALSE);
/*
-* if we get ENOENT, the taskq couldn't be found to be
-* canceled, so we can just mark it as invalid because
-* it's already gone. If we got EBUSY, then we already
-* blocked until it was gone _anyway_, so we don't care.
+* Clear taskqid only if we successfully cancelled before execution.
+* For ENOENT, task already cleared it. For EBUSY, task will clear
+* it when done.
*/
-se->se_taskqid = TASKQID_INVALID;
-rw_exit(&se->se_taskqid_lock);
+if (err == 0) {
+se->se_taskqid = TASKQID_INVALID;
+zfsctl_snapshot_rele(se);
+}
}
@@ -388,12 +383,11 @@ zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
static void
zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
{
+ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
if (delay <= 0)
return;
-zfsctl_snapshot_hold(se);
-rw_enter(&se->se_taskqid_lock, RW_WRITER);
/*
* If this condition happens, we managed to:
* - dispatch once
@@ -404,13 +398,12 @@ zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
* no problem.
*/
if (se->se_taskqid != TASKQID_INVALID) {
-rw_exit(&se->se_taskqid_lock);
zfsctl_snapshot_rele(se);
return;
}
+zfsctl_snapshot_hold(se);
se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
-rw_exit(&se->se_taskqid_lock);
}
/*
@@ -425,7 +418,7 @@ zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
zfs_snapentry_t *se;
int error = ENOENT;
-rw_enter(&zfs_snapshot_lock, RW_READER);
+rw_enter(&zfs_snapshot_lock, RW_WRITER);
if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
zfsctl_snapshot_unmount_cancel(se);
zfsctl_snapshot_unmount_delay_impl(se, delay);
@@ -614,13 +607,18 @@ zfsctl_destroy(zfsvfs_t *zfsvfs)
rw_enter(&zfs_snapshot_lock, RW_WRITER);
se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
-if (se != NULL)
-zfsctl_snapshot_remove(se);
-rw_exit(&zfs_snapshot_lock);
if (se != NULL) {
+zfsctl_snapshot_remove(se);
+/*
+* Don't wait if snapentry_expire task is calling
+* umount, which may have resulted in this destroy
+* call. Waiting would deadlock: snapentry_expire
+* waits for umount while umount waits for task.
+*/
zfsctl_snapshot_unmount_cancel(se);
zfsctl_snapshot_rele(se);
}
+rw_exit(&zfs_snapshot_lock);
} else if (zfsvfs->z_ctldir) {
iput(zfsvfs->z_ctldir);
zfsvfs->z_ctldir = NULL;

View File

@@ -573,7 +573,8 @@ zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
if (zfsvfs->z_draining) {
zfsvfs->z_drain_cancel = B_TRUE;
taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
-dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
+dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task,
+B_TRUE);
zfsvfs->z_drain_task = TASKQID_INVALID;
zfsvfs->z_draining = B_FALSE;
}

View File

@@ -1452,7 +1452,8 @@ dmu_objset_upgrade_stop(objset_t *os)
os->os_upgrade_id = 0;
mutex_exit(&os->os_upgrade_lock);
-if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) {
+if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id,
+B_TRUE)) == 0) {
dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
}
txg_wait_synced(os->os_spa->spa_dsl_pool, 0);

View File

@@ -1934,7 +1934,7 @@ spa_deactivate(spa_t *spa)
list_destroy(&spa->spa_evicting_os_list);
list_destroy(&spa->spa_state_dirty_list);
-taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
@@ -10451,7 +10451,7 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_sync_starttime = gethrtime();
-taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
NSEC_TO_TICK(spa->spa_deadman_synctime));
@@ -10508,7 +10508,7 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_sync_rewrite_vdev_config(spa, tx);
dmu_tx_commit(tx);
-taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
spa->spa_deadman_tqid = 0;
/*

View File

@@ -1531,7 +1531,8 @@ zfs_ereport_taskq_fini(void)
{
mutex_enter(&recent_events_lock);
if (recent_events_cleaner_tqid != 0) {
-taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
+taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid,
+B_TRUE);
recent_events_cleaner_tqid = 0;
}
mutex_exit(&recent_events_lock);