Make arc_prune() asynchronous
As described in the comment above arc_adapt_thread() it is critical that the arc_adapt_thread() function never sleep while holding a hash lock. This behavior was possible in the Linux implementation because the arc_prune() logic was implemented to be synchronous. Under illumos the analogous dnlc_reduce_cache() function is asynchronous. To address this, the arc_do_user_prune() function has been reworked into two new functions as follows:

* arc_prune_async() is an asynchronous implementation which dispatches the prune callback to be run by the arc_prune taskq. This makes it suitable for use in the context of the arc_adapt_thread().

* arc_prune() is a synchronous implementation which depends on the arc_prune_async() implementation but blocks until the outstanding callbacks complete. This is used in arc_kmem_reap_now() where it is safe, and expected, that memory will be freed.

This patch additionally adds the zfs_arc_meta_strategy module option which allows the meta reclaim strategy to be configured. It defaults to a balanced strategy which has proven to work well under Linux, but the illumos meta-only strategy can be enabled instead.

Signed-off-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Parent: c5528b9ba6
Commit: f604673836
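For context, consumers interact with the machinery changed below roughly as follows. This is a minimal sketch, assuming the existing arc_add_prune_callback()/arc_remove_prune_callback() registration API; my_fs_t and the callback body are hypothetical:

	/*
	 * Hypothetical consumer of the ARC prune callback API.  With this
	 * patch the callback runs from the arc_prune taskq rather than
	 * directly in arc_adapt_thread(), so it may block safely.
	 */
	#include <sys/arc.h>

	typedef struct my_fs {
		arc_prune_t *fs_prune;	/* handle returned at registration */
	} my_fs_t;

	static void
	my_fs_prune(int64_t nr_to_scan, void *priv)
	{
		my_fs_t *fs = priv;
		/* drop up to nr_to_scan holds on metadata cached by fs */
	}

	static void
	my_fs_mount(my_fs_t *fs)
	{
		fs->fs_prune = arc_add_prune_callback(my_fs_prune, fs);
	}

	static void
	my_fs_unmount(my_fs_t *fs)
	{
		/*
		 * Safe even if a prune task is still in flight: the
		 * arc_prune_t is refcounted and freed only when the last
		 * holder drops its reference.
		 */
		arc_remove_prune_callback(fs->fs_prune);
	}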
@@ -59,10 +59,16 @@ arc_done_func_t arc_getbuf_func;
 struct arc_prune {
 	arc_prune_func_t	*p_pfunc;
 	void			*p_private;
+	uint64_t		p_adjust;
 	list_node_t		p_node;
 	refcount_t		p_refcnt;
 };
 
+typedef enum arc_strategy {
+	ARC_STRATEGY_META_ONLY		= 0,	/* Evict only meta data buffers */
+	ARC_STRATEGY_META_BALANCED	= 1,	/* Evict data buffers if needed */
+} arc_strategy_t;
+
 typedef enum arc_flags
 {
 	/*
module/zfs/arc.c (155 lines changed)
@@ -167,6 +167,9 @@ static boolean_t arc_user_evicts_thread_exit;
 /* number of objects to prune from caches when arc_meta_limit is reached */
 int zfs_arc_meta_prune = 10000;
 
+/* The preferred strategy to employ when arc_meta_limit is reached */
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
+
 typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
@@ -531,6 +534,7 @@ static arc_state_t *arc_l2c_only;
 
 static list_t arc_prune_list;
 static kmutex_t arc_prune_mtx;
+static taskq_t *arc_prune_taskq;
 static arc_buf_t *arc_eviction_list;
 static arc_buf_hdr_t arc_eviction_hdr;
 
@@ -2430,47 +2434,64 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
 }
 
 /*
- * Request that arc user drop references so that N bytes can be released
- * from the cache. This provides a mechanism to ensure the arc can honor
- * the arc_meta_limit and reclaim buffers which are pinned in the cache
- * by higher layers. (i.e. the zpl)
+ * Helper function for arc_prune() it is responsible for safely handling
+ * the execution of a registered arc_prune_func_t.
  */
 static void
-arc_do_user_prune(int64_t adjustment)
+arc_prune_task(void *ptr)
 {
-	arc_prune_func_t *func;
-	void *private;
-	arc_prune_t *cp, *np;
+	arc_prune_t *ap = (arc_prune_t *)ptr;
+	arc_prune_func_t *func = ap->p_pfunc;
+
+	if (func != NULL)
+		func(ap->p_adjust, ap->p_private);
+
+	/* Callback unregistered concurrently with execution */
+	if (refcount_remove(&ap->p_refcnt, func) == 0) {
+		ASSERT(!list_link_active(&ap->p_node));
+		refcount_destroy(&ap->p_refcnt);
+		kmem_free(ap, sizeof (*ap));
+	}
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffered they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asyncronously so it may be safely called
+ * in the context of the arc_adapt_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+static void
+arc_prune_async(int64_t adjust)
+{
+	arc_prune_t *ap;
 
 	mutex_enter(&arc_prune_mtx);
+	for (ap = list_head(&arc_prune_list); ap != NULL;
+	    ap = list_next(&arc_prune_list, ap)) {
 
-	cp = list_head(&arc_prune_list);
-	while (cp != NULL) {
-		func = cp->p_pfunc;
-		private = cp->p_private;
-		np = list_next(&arc_prune_list, cp);
-		refcount_add(&cp->p_refcnt, func);
-		mutex_exit(&arc_prune_mtx);
+		if (refcount_count(&ap->p_refcnt) >= 2)
+			continue;
 
-		if (func != NULL)
-			func(adjustment, private);
+		refcount_add(&ap->p_refcnt, ap->p_pfunc);
+		ap->p_adjust = adjust;
+		taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
+		ARCSTAT_BUMP(arcstat_prune);
+	}
+	mutex_exit(&arc_prune_mtx);
+}
 
-		mutex_enter(&arc_prune_mtx);
-
-		/* User removed prune callback concurrently with execution */
-		if (refcount_remove(&cp->p_refcnt, func) == 0) {
-			ASSERT(!list_link_active(&cp->p_node));
-			refcount_destroy(&cp->p_refcnt);
-			kmem_free(cp, sizeof (*cp));
-		}
-
-		cp = np;
-	}
-
-	ARCSTAT_BUMP(arcstat_prune);
-	mutex_exit(&arc_prune_mtx);
+static void
+arc_prune(int64_t adjust)
+{
+	arc_prune_async(adjust);
+	taskq_wait_outstanding(arc_prune_taskq, 0);
 }
 
 /*
  * Evict the specified number of bytes from the state specified,
  * restricting eviction to the spa and type given. This function
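The refcount_count(&ap->p_refcnt) >= 2 test above is worth spelling out: registration itself holds one reference, so a count of two or more means a prune task for that callback is already queued or running and another dispatch would be redundant. The implied lifetime, restated as a comment (illustrative only; this is my reading of the hunk above, not additional code from the patch):

	/*
	 * p_refcnt == 1  callback registered, no prune task in flight
	 * p_refcnt == 2  a task is queued or running; arc_prune_async()
	 *                skips the entry, so at most one task per
	 *                callback is ever outstanding
	 * p_refcnt == 0  (seen by arc_prune_task() after its remove)
	 *                the callback was unregistered while the task
	 *                ran, so the task frees the arc_prune_t itself
	 */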
@@ -2511,7 +2532,7 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
  * available for reclaim.
  */
 static uint64_t
-arc_adjust_meta(void)
+arc_adjust_meta_balanced(void)
 {
 	int64_t adjustmnt, delta, prune = 0;
 	uint64_t total_evicted = 0;
@@ -2580,7 +2601,7 @@ restart:
 
 	if (zfs_arc_meta_prune) {
 		prune += zfs_arc_meta_prune;
-		arc_do_user_prune(prune);
+		arc_prune_async(prune);
 	}
 }
 
@@ -2592,6 +2613,50 @@ restart:
 	return (total_evicted);
 }
 
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta_only(void)
+{
+	uint64_t total_evicted = 0;
+	int64_t target;
+
+	/*
+	 * If we're over the meta limit, we want to evict enough
+	 * metadata to get back under the meta limit. We don't want to
+	 * evict so much that we drop the MRU below arc_p, though. If
+	 * we're over the meta limit more than we're over arc_p, we
+	 * evict some from the MRU here, and some from the MFU below.
+	 */
+	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
+
+	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+	/*
+	 * Similar to the above, we want to evict enough bytes to get us
+	 * below the meta limit, but not so much as to drop us below the
+	 * space alloted to the MFU (which is defined as arc_c - arc_p).
+	 */
+	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+	    (int64_t)(arc_mfu->arcs_size - (arc_c - arc_p)));
+
+	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+	return (total_evicted);
+}
+
+static uint64_t
+arc_adjust_meta(void)
+{
+	if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
+		return (arc_adjust_meta_only());
+	else
+		return (arc_adjust_meta_balanced());
+}
+
 /*
  * Return the type of the oldest buffer in the given arc state
  *
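The two MIN() targets in arc_adjust_meta_only() are easier to see with numbers. A self-contained sketch with hypothetical sizes, not taken from the patch:

	#include <stdio.h>

	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		/* Hypothetical sizes in MiB, chosen only for illustration */
		long long meta_used = 600, meta_limit = 500;	/* 100 over limit */
		long long anon_mru = 420, arc_p = 400;		/* MRU 20 over arc_p */
		long long mfu = 700, arc_c = 1000;		/* MFU share = c - p */

		long long mru_target = MIN(meta_used - meta_limit,
		    anon_mru - arc_p);
		long long mfu_target = MIN(meta_used - meta_limit,
		    mfu - (arc_c - arc_p));

		/*
		 * Prints mru_target=20 mfu_target=100: 20 MiB of metadata
		 * comes off the MRU (stopping at arc_p) and up to 100 MiB
		 * off the MFU (stopping at its arc_c - arc_p share).
		 */
		printf("mru_target=%lld mfu_target=%lld\n",
		    mru_target, mfu_target);
		return (0);
	}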
@@ -2905,6 +2970,14 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
 	extern kmem_cache_t *zio_buf_cache[];
 	extern kmem_cache_t *zio_data_buf_cache[];
 
+	if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
+		/*
+		 * We are exceeding our meta-data cache limit.
+		 * Prune some entries to release holds on meta-data.
+		 */
+		arc_prune(zfs_arc_meta_prune);
+	}
+
 	/*
 	 * An aggressive reclamation will shrink the cache size as well as
 	 * reap free buffers from the arc kmem caches.
@@ -2929,15 +3002,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
 }
 
 /*
- * Unlike other ZFS implementations this thread is only responsible for
- * adapting the target ARC size on Linux. The responsibility for memory
- * reclamation has been entirely delegated to the arc_shrinker_func()
- * which is registered with the VM. To reflect this change in behavior
- * the arc_reclaim thread has been renamed to arc_adapt.
- *
- * The following comment from arc_reclaim_thread() in illumos is still
- * applicable:
- *
  * Threads can block in arc_get_data_buf() waiting for this thread to evict
  * enough data and signal them to proceed. When this happens, the threads in
  * arc_get_data_buf() are sleeping while holding the hash lock for their
@@ -4862,6 +4926,9 @@ arc_init(void)
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
 
+	arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
+	    max_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
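The taskq_create() call added above, annotated argument by argument (ordering per the SPL/illumos interface: name, nthreads, pri, minalloc, maxalloc, flags; the glosses are mine, not comments from the patch):

	/*
	 * "arc_prune"		queue name, visible under /proc/spl/taskq
	 * max_ncpus		worker threads, so prune callbacks can run
	 *			concurrently on large systems
	 * minclsyspri		minimum system-class thread priority
	 * max_ncpus		task entries pre-allocated (minalloc)
	 * INT_MAX		effectively no cap on queued tasks (maxalloc)
	 * TASKQ_PREPOPULATE	allocate entries up front so dispatch does
	 *			not allocate in the reclaim path
	 */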
@@ -4943,6 +5010,9 @@ arc_fini(void)
 		arc_ksp = NULL;
 	}
 
+	taskq_wait(arc_prune_taskq);
+	taskq_destroy(arc_prune_taskq);
+
 	mutex_enter(&arc_prune_mtx);
 	while ((p = list_head(&arc_prune_list)) != NULL) {
 		list_remove(&arc_prune_list, p);
@@ -6374,6 +6444,9 @@ module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
 	"Limit number of restarts in arc_adjust_meta");
 
+module_param(zfs_arc_meta_strategy, int, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");
+
 module_param(zfs_arc_grow_retry, int, 0644);
 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
 
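Since zfs_arc_meta_strategy is registered with mode 0644, it can also be changed on a running system through /sys/module/zfs/parameters/zfs_arc_meta_strategy; per the arc_strategy_t enum added above, 0 selects the illumos-style meta-only strategy and 1 the balanced Linux default.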