ARC: parallel eviction

On systems with enormous amounts of memory, the single arc_evict thread
can become a bottleneck if reads and writes are stuck behind it, waiting
for old data to be evicted before new data can take its place.

This commit adds support for evicting from multiple ARC lists in
parallel, by farming the evict work out to some number of threads and
then accumulating their results.
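
At its core this is a fan-out/fan-in pattern over the kernel taskq API. A minimal sketch, using the taskq calls and the evict_arg_t/arc_evict_task() pieces introduced by this patch (the helper name evict_parallel is illustrative only):

    static uint64_t
    evict_parallel(evict_arg_t *eva, uint_t ntasks)
    {
        uint64_t evicted = 0;

        /* Fan out: dispatch one task per prepared argument block. */
        for (uint_t i = 0; i < ntasks; i++)
            taskq_dispatch_ent(arc_evict_taskq, arc_evict_task,
                &eva[i], 0, &eva[i].eva_tqent);

        /* Fan in: wait for all tasks, then accumulate their results. */
        taskq_wait(arc_evict_taskq);
        for (uint_t i = 0; i < ntasks; i++)
            evicted += eva[i].eva_evicted;

        return (evicted);
    }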

A new tunable, zfs_arc_evict_threads, sets the number of threads. By
default, the thread count scales with the number of CPUs.
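
The parameter can only be set at module load time. On Linux, for example, the autoscaling could be overridden with a modprobe option (the value 4 here is purely illustrative):

    # /etc/modprobe.d/zfs.conf
    options zfs zfs_arc_evict_threads=4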

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Youzhong Yang <youzhong@gmail.com>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Mateusz Piotrowski <mateusz.piotrowski@klarasystems.com>
Signed-off-by: Alexander Stetsenko <alex.stetsenko@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Mateusz Piotrowski <mateusz.piotrowski@klarasystems.com>
Co-authored-by: Alexander Stetsenko <alex.stetsenko@klarasystems.com>
Closes #16486
commit b6916f995e (parent 89a8a91582)
Allan Jude, 2025-05-14 10:38:32 -04:00, committed by GitHub
2 changed files with 233 additions and 21 deletions

man/man4/zfs.4

@@ -3,7 +3,7 @@
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024 Klara, Inc.
.\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
@@ -17,9 +17,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.\" Copyright (c) 2024, Klara, Inc.
.\"
.Dd November 1, 2024
.Dd May 7, 2025
.Dt ZFS 4
.Os
.
@@ -740,6 +738,40 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
This batch-style operation prevents entire sub-lists from being evicted at once
but comes at a cost of additional unlocking and locking.
.
.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int
Sets the number of ARC eviction threads to be used.
.Pp
If set greater than 0, ZFS will dedicate up to that many threads to ARC
eviction.
Each thread will process one sub-list at a time,
until the eviction target is reached or all sub-lists have been processed.
When set to 0, ZFS will compute a reasonable number of eviction threads based
on the number of CPUs.
.TS
box;
lb l .
CPUs	Threads
_
1-5	1
6-7	2
8-15	3
16-31	4
32-63	6
64-95	8
96-127	9
128-159	11
160-191	12
192-223	13
224-255	14
256+	16
.TE
.Pp
More threads may improve the responsiveness of ZFS to memory pressure.
This can be important for performance when eviction from the ARC becomes
a bottleneck for reads and writes.
.Pp
This parameter can only be set at module load time.
.
.It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
If set to a non zero value, it will replace the
.Sy arc_grow_retry

module/zfs/arc.c

@@ -27,7 +27,7 @@
* Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2020, George Amanakis. All rights reserved.
* Copyright (c) 2019, 2024, Klara Inc.
* Copyright (c) 2019, 2024, 2025, Klara, Inc.
* Copyright (c) 2019, Allan Jude
* Copyright (c) 2020, The FreeBSD Foundation [1]
* Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
@@ -337,6 +337,9 @@ static kmutex_t arc_evict_lock;
static boolean_t arc_evict_needed = B_FALSE;
static clock_t arc_last_uncached_flush;
static taskq_t *arc_evict_taskq;
static struct evict_arg *arc_evict_arg;
/*
* Count of bytes evicted since boot.
*/
@@ -470,6 +473,18 @@ static int zfs_arc_prune_task_threads = 1;
/* Used by spa_export/spa_destroy to flush the arc asynchronously */
static taskq_t *arc_flush_taskq;
/*
* Controls the number of ARC eviction threads to dispatch sublists to.
*
* Possible values:
* 0 (auto) compute the number of threads using a logarithmic formula.
* 1 (disabled) one thread - parallel eviction is disabled.
* 2+ (manual) set the number manually.
*
* See arc_evict_thread_init() for how "auto" is computed.
*/
static uint_t zfs_arc_evict_threads = 0;
/* The 7 states: */
arc_state_t ARC_anon;
arc_state_t ARC_mru;
@@ -4049,6 +4064,62 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
kmem_free(markers, sizeof (*markers) * count);
}
typedef struct evict_arg {
taskq_ent_t eva_tqent;		/* taskq entry used to dispatch this task */
multilist_t *eva_ml;		/* the multilist to evict from */
arc_buf_hdr_t *eva_marker;	/* marker for our position in the sublist */
int eva_idx;			/* sublist index */
uint64_t eva_spa;		/* spa to evict from (0 means any) */
uint64_t eva_bytes;		/* target number of bytes to evict */
uint64_t eva_evicted;		/* out: bytes actually evicted */
} evict_arg_t;
static void
arc_evict_task(void *arg)
{
evict_arg_t *eva = arg;
eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx,
eva->eva_marker, eva->eva_spa, eva->eva_bytes);
}
static void
arc_evict_thread_init(void)
{
if (zfs_arc_evict_threads == 0) {
/*
* Compute number of threads we want to use for eviction.
*
* Normally, it's log2(ncpus) + ncpus/32, which gets us to the
* default max of 16 threads at ~256 CPUs.
*
* However, that formula gives two threads at 4 CPUs, which
* is still rather too low to be really useful, so we just go
* with 1 thread at fewer than 6 cores.
*/
if (max_ncpus < 6)
zfs_arc_evict_threads = 1;
else
zfs_arc_evict_threads =
(highbit64(max_ncpus) - 1) + max_ncpus / 32;
} else if (zfs_arc_evict_threads > max_ncpus)
zfs_arc_evict_threads = max_ncpus;
if (zfs_arc_evict_threads > 1) {
arc_evict_taskq = taskq_create("arc_evict",
zfs_arc_evict_threads, defclsyspri, 0, INT_MAX,
TASKQ_PREPOPULATE);
arc_evict_arg = kmem_zalloc(
sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP);
}
}
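
To sanity-check the autoscaling formula, here is a small userland sketch (a portable stand-in for the kernel's highbit64() is assumed; the constants mirror arc_evict_thread_init() above):

    #include <stdio.h>

    /* Stand-in for highbit64(): 1-based index of the highest set bit. */
    static unsigned
    highbit64(unsigned long long n)
    {
        unsigned b = 0;
        while (n != 0) {
            b++;
            n >>= 1;
        }
        return (b);
    }

    static unsigned
    evict_threads(unsigned ncpus)
    {
        if (ncpus < 6)
            return (1);
        return ((highbit64(ncpus) - 1) + ncpus / 32);
    }

    int
    main(void)
    {
        unsigned cpus[] = { 4, 8, 16, 32, 64, 256 };

        /* Prints 1, 3, 4, 6, 8 and 16, matching the zfs.4 table. */
        for (int i = 0; i < 6; i++)
            printf("%u CPUs -> %u threads\n", cpus[i],
                evict_threads(cpus[i]));
        return (0);
    }
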
/*
* The minimum number of bytes we can evict at once is a block size.
* So, SPA_MAXBLOCKSIZE is a reasonable minimum value per eviction task.
* We use this value to compute a scaling factor for the eviction tasks.
*/
#define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)
/*
* Evict buffers from the given arc state, until we've removed the
* specified number of bytes. Move the removed buffers to the
@@ -4070,9 +4141,12 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
multilist_t *ml = &state->arcs_list[type];
int num_sublists;
arc_buf_hdr_t **markers;
evict_arg_t *eva = NULL;
num_sublists = multilist_get_num_sublists(ml);
boolean_t use_evcttq = zfs_arc_evict_threads > 1;
/*
* If we've tried to evict from each sublist, made some
* progress, but still have not hit the target number of bytes
@@ -4094,25 +4168,91 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
multilist_sublist_unlock(mls);
}
if (use_evcttq) {
if (zthr_iscurthread(arc_evict_zthr))
eva = arc_evict_arg;
else
eva = kmem_alloc(sizeof (evict_arg_t) *
zfs_arc_evict_threads, KM_NOSLEEP);
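/*
 * Note: there is only ever one arc_evict_zthr, so it can safely use
 * the preallocated arc_evict_arg array; any other caller allocates
 * its own copy, and uses KM_NOSLEEP because blocking for memory
 * while trying to free memory could stall reclaim.
 */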
if (eva) {
for (int i = 0; i < zfs_arc_evict_threads; i++) {
taskq_init_ent(&eva[i].eva_tqent);
eva[i].eva_ml = ml;
eva[i].eva_spa = spa;
}
} else {
/*
* Fall back to the regular single evict if it is not
* possible to allocate memory for the taskq entries.
*/
use_evcttq = B_FALSE;
}
}
/*
* Start eviction using a randomly selected sublist, this is to try and
* evenly balance eviction across all sublists. Always starting at the
* same sublist (e.g. index 0) would cause evictions to favor certain
* sublists over others.
*/
uint64_t scan_evicted = 0;
int sublists_left = num_sublists;
int sublist_idx = multilist_get_random_index(ml);
/*
* While we haven't hit our target number of bytes to evict, or
* we're evicting all available buffers.
*/
while (total_evicted < bytes) {
int sublist_idx = multilist_get_random_index(ml);
uint64_t scan_evicted = 0;
uint64_t evict = MIN_EVICT_SIZE;
uint_t ntasks = zfs_arc_evict_threads;
/*
* Start eviction using a randomly selected sublist,
* this is to try and evenly balance eviction across all
* sublists. Always starting at the same sublist
* (e.g. index 0) would cause evictions to favor certain
* sublists over others.
*/
for (int i = 0; i < num_sublists; i++) {
if (use_evcttq) {
if (sublists_left < ntasks)
ntasks = sublists_left;
if (ntasks < 2)
use_evcttq = B_FALSE;
}
if (use_evcttq) {
uint64_t left = bytes - total_evicted;
if (bytes == ARC_EVICT_ALL) {
evict = bytes;
} else if (left > ntasks * MIN_EVICT_SIZE) {
evict = DIV_ROUND_UP(left, ntasks);
} else {
ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
if (ntasks == 1)
use_evcttq = B_FALSE;
}
}
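/*
 * Worked example (SPA_MAXBLOCKSIZE is 16 MiB): with ntasks = 8 and
 * 256 MiB left to evict, each task is asked for 32 MiB.  With only
 * 100 MiB left, 100 MiB < 8 * 16 MiB, so ntasks drops to
 * DIV_ROUND_UP(100 MiB, 16 MiB) = 7 and each task keeps the 16 MiB
 * minimum.
 */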
for (int i = 0; sublists_left > 0; i++, sublist_idx++,
sublists_left--) {
uint64_t bytes_remaining;
uint64_t bytes_evicted;
/* we've reached the end, wrap to the beginning */
if (sublist_idx >= num_sublists)
sublist_idx = 0;
if (use_evcttq) {
if (i == ntasks)
break;
eva[i].eva_marker = markers[sublist_idx];
eva[i].eva_idx = sublist_idx;
eva[i].eva_bytes = evict;
taskq_dispatch_ent(arc_evict_taskq,
arc_evict_task, &eva[i], 0,
&eva[i].eva_tqent);
continue;
}
if (total_evicted < bytes)
bytes_remaining = bytes - total_evicted;
else
@@ -4123,18 +4263,23 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
scan_evicted += bytes_evicted;
total_evicted += bytes_evicted;
}
/* we've reached the end, wrap to the beginning */
if (++sublist_idx >= num_sublists)
sublist_idx = 0;
if (use_evcttq) {
taskq_wait(arc_evict_taskq);
for (int i = 0; i < ntasks; i++) {
scan_evicted += eva[i].eva_evicted;
total_evicted += eva[i].eva_evicted;
}
}
/*
* If we didn't evict anything during this scan, we have
* no reason to believe we'll evict more during another
* If we scanned all sublists and didn't evict anything, we
* have no reason to believe we'll evict more during another
* scan, so break the loop.
*/
if (scan_evicted == 0) {
if (scan_evicted == 0 && sublists_left == 0) {
/* This isn't possible, let's make that obvious */
ASSERT3S(bytes, !=, 0);
@@ -4151,13 +4296,33 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
break;
}
/*
* If we scanned all sublists but still have more to do,
* reset the counts so we can go around again.
*/
if (sublists_left == 0) {
sublists_left = num_sublists;
sublist_idx = multilist_get_random_index(ml);
scan_evicted = 0;
/*
* Since we're about to reconsider all sublists,
* re-enable use of the evict threads if available.
*/
use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL);
}
}
if (eva != NULL && eva != arc_evict_arg)
kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads);
for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_remove(mls, markers[i]);
multilist_sublist_unlock(mls);
}
if (markers != arc_state_evict_markers)
arc_state_free_markers(markers, num_sublists);
@@ -7805,6 +7970,7 @@ arc_set_limits(uint64_t allmem)
/* How to set default max varies by platform. */
arc_c_max = arc_default_max(arc_c_min, allmem);
}
void
arc_init(void)
{
@@ -7882,6 +8048,8 @@ arc_init(void)
arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
arc_evict_thread_init();
list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
offsetof(arc_async_flush_t, af_node));
mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -7982,11 +8150,20 @@ arc_fini(void)
list_destroy(&arc_prune_list);
mutex_destroy(&arc_prune_mtx);
if (arc_evict_taskq != NULL)
taskq_wait(arc_evict_taskq);
(void) zthr_cancel(arc_evict_zthr);
(void) zthr_cancel(arc_reap_zthr);
arc_state_free_markers(arc_state_evict_markers,
arc_state_evict_marker_count);
if (arc_evict_taskq != NULL) {
taskq_destroy(arc_evict_taskq);
kmem_free(arc_evict_arg,
sizeof (evict_arg_t) * zfs_arc_evict_threads);
}
mutex_destroy(&arc_evict_lock);
list_destroy(&arc_evict_waiters);
@@ -11110,3 +11287,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
"Number of arc_prune threads");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
"Number of threads to use for ARC eviction.");