OpenZFS 6569 - large file delete can starve out write ops

Authored by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: George Melikov <mail@gmelikov.ru>
Tested-by: kernelOfTruth <kerneloftruth@gmail.com>
OpenZFS-issue: https://www.illumos.org/issues/6569
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/1bf4b6f2
Closes #5706
2026-05-15 18:56:59 +03:00 · 2017-02-01 01:44:03 +03:00 · 2017-02-01 01:44:03 +03:00 · 539d33c791
commit 539d33c791
parent a873815b95
4 changed files with 94 additions and 5 deletions
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@ -21,6 +21,7 @@
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
 */

 #ifndef	_SYS_DSL_POOL_H
@ -106,6 +107,7 @@ typedef struct dsl_pool {
 	kcondvar_t dp_spaceavail_cv;
 	uint64_t dp_dirty_pertxg[TXG_SIZE];
 	uint64_t dp_dirty_total;
+	uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
 	uint64_t dp_mos_used_delta;
 	uint64_t dp_mos_compressed_delta;
 	uint64_t dp_mos_uncompressed_delta;
--- a/include/sys/trace_dmu.h
+++ b/include/sys/trace_dmu.h
@ -112,6 +112,36 @@ DEFINE_EVENT(zfs_delay_mintime_class, name, \
 /* END CSTYLED */
 DEFINE_DELAY_MINTIME_EVENT(zfs_delay__mintime);

+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_free_long_range_class,
+	TP_PROTO(uint64_t long_free_dirty_all_txgs, uint64_t chunk_len, \
+	    uint64_t txg),
+	TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg),
+	TP_STRUCT__entry(
+	    __field(uint64_t,			long_free_dirty_all_txgs)
+	    __field(uint64_t,			chunk_len)
+	    __field(uint64_t,			txg)
+	),
+	TP_fast_assign(
+	    __entry->long_free_dirty_all_txgs	= long_free_dirty_all_txgs;
+	    __entry->chunk_len					= chunk_len;
+	    __entry->txg						= txg;
+	),
+	TP_printk("long_free_dirty_all_txgs %llu chunk_len %llu txg %llu",
+	   __entry->long_free_dirty_all_txgs,
+	   __entry->chunk_len, __entry->txg)
+);
+/* END CSTYLED */
+
+/* BEGIN CSTYLED */
+#define	DEFINE_FREE_LONG_RANGE_EVENT(name) \
+DEFINE_EVENT(zfs_free_long_range_class, name, \
+	TP_PROTO(uint64_t long_free_dirty_all_txgs, \
+	    uint64_t chunk_len, uint64_t txg), \
+	TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg))
+/* END CSTYLED */
+DEFINE_FREE_LONG_RANGE_EVENT(zfs_free__long__range);
+
 #endif /* _TRACE_DMU_H */

 #undef TRACE_INCLUDE_PATH
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@ -48,6 +48,7 @@
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
+#include <sys/trace_dmu.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
@ -58,6 +59,14 @@
 */
 int zfs_nopwrite_enabled = 1;

+/*
+ * Tunable capping the bytes queued by long-range frees, summed over all
+ * TXG slots, as a percentage of zfs_dirty_data_max.  Once the cap is hit,
+ * further frees wait for the next TXG to open.  A value of zero disables
+ * the throttle; values above 100 fall back to zfs_dirty_data_max / 4.
+ */
+uint32_t zfs_per_txg_dirty_frees_percent = 30;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
 	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
@ -727,6 +736,9 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 {
 	uint64_t object_size;
 	int err;
+	uint64_t dirty_frees_threshold;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+	int t;

 	if (dn == NULL)
 		return (SET_ERROR(EINVAL));
@ -735,11 +747,18 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 	if (offset >= object_size)
 		return (0);

+	if (zfs_per_txg_dirty_frees_percent <= 100)
+		dirty_frees_threshold =
+		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
+	else
+		dirty_frees_threshold = zfs_dirty_data_max / 4;
+
 	if (length == DMU_OBJECT_END || offset + length > object_size)
 		length = object_size - offset;

 	while (length != 0) {
-		uint64_t chunk_end, chunk_begin;
+		uint64_t chunk_end, chunk_begin, chunk_len;
+		uint64_t long_free_dirty_all_txgs = 0;
 		dmu_tx_t *tx;

 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
@ -754,9 +773,28 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 		ASSERT3U(chunk_begin, >=, offset);
 		ASSERT3U(chunk_begin, <=, chunk_end);

+		chunk_len = chunk_end - chunk_begin;
+
+		mutex_enter(&dp->dp_lock);
+		for (t = 0; t < TXG_SIZE; t++) {
+			long_free_dirty_all_txgs +=
+			    dp->dp_long_free_dirty_pertxg[t];
+		}
+		mutex_exit(&dp->dp_lock);
+
+		/*
+		 * To avoid filling up a TXG with just frees, wait for
+		 * the next TXG to open before freeing more chunks if
+		 * we have reached the threshold of frees.
+		 */
+		if (dirty_frees_threshold != 0 &&
+		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
+			txg_wait_open(dp, 0);
+			continue;
+		}
+
 		tx = dmu_tx_create(os);
-		dmu_tx_hold_free(tx, dn->dn_object,
-		    chunk_begin, chunk_end - chunk_begin);
+		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);

 		/*
 		 * Mark this transaction as typically resulting in a net
@ -768,10 +806,18 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 			dmu_tx_abort(tx);
 			return (err);
 		}
-		dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
+
+		mutex_enter(&dp->dp_lock);
+		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
+		    chunk_len;
+		mutex_exit(&dp->dp_lock);
+		DTRACE_PROBE3(free__long__range,
+		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
+		    uint64_t, dmu_tx_get_txg(tx));
+		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 		dmu_tx_commit(tx);

-		length -= chunk_end - chunk_begin;
+		length -= chunk_len;
 	}
 	return (0);
 }
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@ -23,6 +23,7 @@
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
 */

 #include <sys/dsl_pool.h>
@ -509,6 +510,16 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	 */
 	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

+	/*
+	 * Reset this TXG's long range free counter now that
+	 * we're done syncing user data.
+	 */
+	mutex_enter(&dp->dp_lock);
+	ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
+	    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
+	dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
+	mutex_exit(&dp->dp_lock);
+
 	/*
 	 * After the data blocks have been written (ensured by the zio_wait()
 	 * above), update the user/group space accounting.