port async unlinked drain from illumos-nexenta

This patch is an async implementation of the existing sync zfs_unlinked_drain() function. This function is called at mount time and is responsible for freeing znodes that we didn't get to freeing before. We don't have to hold mounting of the dataset until the unlinked list is fully drained as is done now. Since we can process the unlinked set asynchronously this results in a better user experience when mounting a dataset with entries in the unlinked set. Reviewed by: Jorgen Lundman <lundman@lundman.net> Reviewed by: Tom Caputi <tcaputi@datto.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matt Ahrens <mahrens@delphix.com> Reviewed by: Paul Dagnelie <pcd@delphix.com> Signed-off-by: Alek Pinchuk <apinchuk@datto.com> Closes #8142
2026-05-22 02:27:36 +03:00 · 2019-02-12 10:41:15 -08:00
parent 425d3237ee
commit dcec0a12c8
13 changed files with 300 additions and 10 deletions
@@ -21,6 +21,7 @@

 /*
 * Copyright (c) 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2018 Datto Inc.
 */

 #include <sys/dataset_kstats.h>
@@ -34,6 +35,8 @@ static dataset_kstat_values_t empty_dataset_kstats = {
 	{ "nwritten",	KSTAT_DATA_UINT64 },
 	{ "reads",	KSTAT_DATA_UINT64 },
 	{ "nread",	KSTAT_DATA_UINT64 },
+	{ "nunlinks",	KSTAT_DATA_UINT64 },
+	{ "nunlinked",	KSTAT_DATA_UINT64 },
 };

 static int
@@ -54,6 +57,10 @@ dataset_kstats_update(kstat_t *ksp, int rw)
 	    aggsum_value(&dk->dk_aggsums.das_reads);
 	dkv->dkv_nread.value.ui64 =
 	    aggsum_value(&dk->dk_aggsums.das_nread);
+	dkv->dkv_nunlinks.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_nunlinks);
+	dkv->dkv_nunlinked.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_nunlinked);

 	return (0);
 }
@@ -136,6 +143,8 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
 	aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
 	aggsum_init(&dk->dk_aggsums.das_reads, 0);
 	aggsum_init(&dk->dk_aggsums.das_nread, 0);
+	aggsum_init(&dk->dk_aggsums.das_nunlinks, 0);
+	aggsum_init(&dk->dk_aggsums.das_nunlinked, 0);
 }

 void
@@ -156,6 +165,8 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
 	aggsum_fini(&dk->dk_aggsums.das_nwritten);
 	aggsum_fini(&dk->dk_aggsums.das_reads);
 	aggsum_fini(&dk->dk_aggsums.das_nread);
+	aggsum_fini(&dk->dk_aggsums.das_nunlinks);
+	aggsum_fini(&dk->dk_aggsums.das_nunlinked);
 }

 void
@@ -183,3 +194,21 @@ dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
 	aggsum_add(&dk->dk_aggsums.das_reads, 1);
 	aggsum_add(&dk->dk_aggsums.das_nread, nread);
 }
+
+void
+dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta)
+{
+	if (dk->dk_kstats == NULL)
+		return;
+
+	aggsum_add(&dk->dk_aggsums.das_nunlinks, delta);
+}
+
+void
+dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta)
+{
+	if (dk->dk_kstats == NULL)
+		return;
+
+	aggsum_add(&dk->dk_aggsums.das_nunlinked, delta);
+}
@@ -223,6 +223,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)

 	dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri,
 	    max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
+	    max_ncpus, defclsyspri, max_ncpus, INT_MAX,
+	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

 	return (dp);
 }
@@ -413,6 +416,7 @@ dsl_pool_close(dsl_pool_t *dp)
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
+	taskq_destroy(dp->dp_unlinked_drain_taskq);
 	taskq_destroy(dp->dp_iput_taskq);
 	if (dp->dp_blkstats != NULL) {
 		mutex_destroy(&dp->dp_blkstats->zab_lock);
@@ -1097,6 +1101,12 @@ dsl_pool_iput_taskq(dsl_pool_t *dp)
 	return (dp->dp_iput_taskq);
 }

+taskq_t *
+dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
+{
+	return (dp->dp_unlinked_drain_taskq);
+}
+
 /*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
@@ -458,26 +458,31 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)

 	VERIFY3U(0, ==,
 	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+	dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
 }

 /*
 * Clean up any znodes that had no links when we either crashed or
 * (force) umounted the file system.
 */
-void
-zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+static void
+zfs_unlinked_drain_task(void *arg)
 {
+	zfsvfs_t *zfsvfs = arg;
 	zap_cursor_t	zc;
 	zap_attribute_t zap;
 	dmu_object_info_t doi;
 	znode_t		*zp;
 	int		error;

+	ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
+
 	/*
 	 * Iterate over the contents of the unlinked set.
 	 */
 	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
-	    zap_cursor_retrieve(&zc, &zap) == 0;
+	    zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
 	    zap_cursor_advance(&zc)) {

 		/*
@@ -507,9 +512,61 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs)
 			continue;

 		zp->z_unlinked = B_TRUE;
+
+		/*
+		 * iput() is Linux's equivalent to illumos' VN_RELE(). It will
+		 * decrement the inode's ref count and may cause the inode to be
+		 * synchronously freed. We interrupt freeing of this inode, by
+		 * checking the return value of dmu_objset_zfs_unmounting() in
+		 * dmu_free_long_range(), when an unmount is requested.
+		 */
 		iput(ZTOI(zp));
+		ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
 	}
 	zap_cursor_fini(&zc);
+
+	zfsvfs->z_draining = B_FALSE;
+	zfsvfs->z_drain_task = TASKQID_INVALID;
+}
+
+/*
+ * Sets z_draining then tries to dispatch async unlinked drain.
+ * If that fails executes synchronous unlinked drain.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+	ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
+
+	zfsvfs->z_draining = B_TRUE;
+	zfsvfs->z_drain_cancel = B_FALSE;
+
+	zfsvfs->z_drain_task = taskq_dispatch(
+	    dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
+	    zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
+	if (zfsvfs->z_drain_task == TASKQID_INVALID) {
+		zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
+		zfs_unlinked_drain_task(zfsvfs);
+	}
+}
+
+/*
+ * Wait for the unlinked drain taskq task to stop. This will interrupt the
+ * unlinked set processing if it is in progress.
+ */
+void
+zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
+{
+	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+
+	if (zfsvfs->z_draining) {
+		zfsvfs->z_drain_cancel = B_TRUE;
+		taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
+		    dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
+		zfsvfs->z_drain_task = TASKQID_INVALID;
+		zfsvfs->z_draining = B_FALSE;
+	}
 }

 /*
@@ -684,6 +741,8 @@ zfs_rmnode(znode_t *zp)
 	VERIFY3U(0, ==,
 	    zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));

+	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
 	zfs_znode_delete(zp, tx);

 	dmu_tx_commit(tx);
@@ -1178,6 +1178,10 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
 		return (error);
 	}

+	zfsvfs->z_drain_task = TASKQID_INVALID;
+	zfsvfs->z_draining = B_FALSE;
+	zfsvfs->z_drain_cancel = B_TRUE;
+
 	*zfvp = zfsvfs;
 	return (0);
 }
@@ -1200,14 +1204,27 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 	 * operations out since we closed the ZIL.
 	 */
 	if (mounting) {
+		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
 		/*
 		 * During replay we remove the read only flag to
 		 * allow replays to succeed.
 		 */
-		if (readonly != 0)
+		if (readonly != 0) {
 			readonly_changed_cb(zfsvfs, B_FALSE);
-		else
+		} else {
+			zap_stats_t zs;
+			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+			    &zs) == 0) {
+				dataset_kstats_update_nunlinks_kstat(
+				    &zfsvfs->z_kstat, zs.zs_num_entries);
+			}
+			dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+			    "num_entries in unlinked set: %llu",
+			    zs.zs_num_entries);
 			zfs_unlinked_drain(zfsvfs);
+		}

 		/*
 		 * Parse and replay the intent log.
@@ -1250,9 +1267,6 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 		/* restore readonly bit */
 		if (readonly != 0)
 			readonly_changed_cb(zfsvfs, B_TRUE);
-
-		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
-		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
 	}

 	/*
@@ -1633,6 +1647,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 {
 	znode_t	*zp;

+	zfs_unlinked_drain_stop_wait(zfsvfs);
+
 	/*
 	 * If someone has not already unmounted this file system,
 	 * drain the iput_taskq to ensure all active references to the
@@ -1884,6 +1900,7 @@ zfs_preumount(struct super_block *sb)

 	/* zfsvfs is NULL when zfs_domount fails during mount */
 	if (zfsvfs) {
+		zfs_unlinked_drain_stop_wait(zfsvfs);
 		zfsctl_destroy(sb->s_fs_info);
 		/*
 		 * Wait for iput_async before entering evict_inodes in
@@ -2159,6 +2176,15 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);

+	if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
+		/*
+		 * zfs_suspend_fs() could have interrupted freeing
+		 * of dnodes. We need to restart this freeing so
+		 * that we don't "leak" the space.
+		 */
+		zfs_unlinked_drain(zfsvfs);
+	}
+
 bail:
 	/* release the VFS ops */
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
@@ -91,6 +91,12 @@ static kmem_cache_t *znode_cache = NULL;
 static kmem_cache_t *znode_hold_cache = NULL;
 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

+/*
+ * This is used by the test suite so that it can delay znodes from being
+ * freed in order to inspect the unlinked set.
+ */
+int zfs_unlink_suspend_progress = 0;
+
 /*
 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
 * z_rangelock. It will modify the offset and length of the lock to reflect
@@ -1339,7 +1345,7 @@ zfs_zinactive(znode_t *zp)
 	 */
 	if (zp->z_unlinked) {
 		ASSERT(!zfsvfs->z_issnap);
-		if (!zfs_is_readonly(zfsvfs)) {
+		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
 			mutex_exit(&zp->z_lock);
 			zfs_znode_hold_exit(zfsvfs, zh);
 			zfs_rmnode(zp);
@@ -2214,4 +2220,7 @@ EXPORT_SYMBOL(zfs_obj_to_path);
 /* CSTYLED */
 module_param(zfs_object_mutex_size, uint, 0644);
 MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
+module_param(zfs_unlink_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
+"(debug - leaks space into the unlinked set)");
 #endif