From d393166c54dc310e4f183e3117254d45153302f5 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 10 Dec 2025 16:03:01 -0500 Subject: [PATCH] ARC: Increase parallel eviction batching Before the parallel eviction implementation, zfs_arc_evict_batch_limit caused loop exits after evicting 10 headers. The cost of that is not big and is well motivated. Now, though, a taskq task exit after the same 10 headers is much more expensive. To cover the context switch overhead of the taskq, introduce another level of batching, controlled by the zfs_arc_evict_batches_limit tunable and used only for parallel eviction. My tests, including 36 parallel reads with 4KB recordsize, that had shown 1.4GB/s (~460K blocks/s) before with heavy arc_evict_lock contention, now show 6.5GB/s (~1.6M blocks/s) without arc_evict_lock contention. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #17970 --- man/man4/zfs.4 | 6 +++++ module/zfs/arc.c | 69 +++++++++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 60ec56b4d..a7e105a95 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -771,6 +771,12 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list. This batch-style operation prevents entire sub-lists from being evicted at once but comes at a cost of additional unlocking and locking. . +.It Sy zfs_arc_evict_batches_limit Ns = Ns Sy 5 Pq uint +Number of +.Sy zfs_arc_evict_batch_limit +batches to process per parallel eviction task under heavy load to reduce number +of context switches. +. .It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int Sets the number of ARC eviction threads to be used. 
.Pp diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 39b92f812..d73678801 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -371,6 +371,12 @@ static uint_t zfs_arc_eviction_pct = 200; */ static uint_t zfs_arc_evict_batch_limit = 10; +/* + * Number batches to process per parallel eviction task under heavy load to + * reduce number of context switches. + */ +static uint_t zfs_arc_evict_batches_limit = 5; + /* number of seconds before growing cache again */ uint_t arc_grow_retry = 5; @@ -3899,7 +3905,7 @@ arc_set_need_free(void) static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, - uint64_t spa, uint64_t bytes) + uint64_t spa, uint64_t bytes, boolean_t *more) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; @@ -3983,6 +3989,10 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, multilist_sublist_unlock(mls); + /* Indicate if another iteration may be productive. */ + if (more) + *more = (hdr != NULL); + /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is @@ -4003,21 +4013,12 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); - cv_broadcast(&aw->aew_cv); + cv_signal(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); - /* - * If the ARC size is reduced from arc_c_max to arc_c_min (especially - * if the average cached block is small), eviction can be on-CPU for - * many seconds. To ensure that other threads that may be bound to - * this CPU are able to make progress, make a voluntary preemption - * call here. 
- */ - kpreempt(KPREEMPT_SYNC); - return (bytes_evicted); } @@ -4078,8 +4079,18 @@ static void arc_evict_task(void *arg) { evict_arg_t *eva = arg; - eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx, - eva->eva_marker, eva->eva_spa, eva->eva_bytes); + uint64_t total_evicted = 0; + boolean_t more; + uint_t batches = zfs_arc_evict_batches_limit; + + /* Process multiple batches to amortize taskq dispatch overhead. */ + do { + total_evicted += arc_evict_state_impl(eva->eva_ml, + eva->eva_idx, eva->eva_marker, eva->eva_spa, + eva->eva_bytes - total_evicted, &more); + } while (total_evicted < eva->eva_bytes && --batches > 0 && more); + + eva->eva_evicted = total_evicted; } static void @@ -4220,18 +4231,19 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, if (bytes == ARC_EVICT_ALL) { evict = bytes; - } else if (left > ntasks * MIN_EVICT_SIZE) { + } else if (left >= ntasks * MIN_EVICT_SIZE) { evict = DIV_ROUND_UP(left, ntasks); } else { - ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE); - if (ntasks == 1) + ntasks = left / MIN_EVICT_SIZE; + if (ntasks < 2) use_evcttq = B_FALSE; + else + evict = DIV_ROUND_UP(left, ntasks); } } for (int i = 0; sublists_left > 0; i++, sublist_idx++, sublists_left--) { - uint64_t bytes_remaining; uint64_t bytes_evicted; /* we've reached the end, wrap to the beginning */ @@ -4253,16 +4265,17 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, continue; } - if (total_evicted < bytes) - bytes_remaining = bytes - total_evicted; - else - break; - bytes_evicted = arc_evict_state_impl(ml, sublist_idx, - markers[sublist_idx], spa, bytes_remaining); + markers[sublist_idx], spa, bytes - total_evicted, + NULL); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; + + if (total_evicted < bytes) + kpreempt(KPREEMPT_SYNC); + else + break; } if (use_evcttq) { @@ -4887,7 +4900,7 @@ arc_evict_cb(void *arg, zthr_t *zthr) */ arc_evict_waiter_t *aw; while ((aw = 
list_remove_head(&arc_evict_waiters)) != NULL) { - cv_broadcast(&aw->aew_cv); + cv_signal(&aw->aew_cv); } arc_set_need_free(); } @@ -5168,9 +5181,8 @@ arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve) uint64_t last_count = 0; mutex_enter(&arc_evict_lock); - if (!list_is_empty(&arc_evict_waiters)) { - arc_evict_waiter_t *last = - list_tail(&arc_evict_waiters); + arc_evict_waiter_t *last; + if ((last = list_tail(&arc_evict_waiters)) != NULL) { last_count = last->aew_count; } else if (!arc_evict_needed) { arc_evict_needed = B_TRUE; @@ -11288,6 +11300,9 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batches_limit, UINT, ZMOD_RW, + "The number of batches to run per parallel eviction task"); + ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, "Number of arc_prune threads");