ARC: Increase parallel eviction batching

Before the parallel eviction implementation, zfs_arc_evict_batch_limit
caused loop exits after evicting 10 headers.  Its cost was small and
well motivated.  Now, though, having a taskq task exit after the same
10 headers is much more expensive.  To cover the context-switch
overhead of the taskq, introduce another level of batching, controlled
by the zfs_arc_evict_batches_limit tunable and used only for parallel
eviction.

My tests, including 36 parallel reads with a 4KB recordsize, showed
1.4GB/s (~460K blocks/s) before, with heavy arc_evict_lock contention;
they now show 6.5GB/s (~1.6M blocks/s) without arc_evict_lock contention.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17970
This commit is contained in:
Alexander Motin 2025-12-10 16:03:01 -05:00 committed by GitHub
parent 9fdb854109
commit d393166c54
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 48 additions and 27 deletions

View File

@ -771,6 +771,12 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
This batch-style operation prevents entire sub-lists from being evicted at once
but comes at a cost of additional unlocking and locking.
.
.It Sy zfs_arc_evict_batches_limit Ns = Ns Sy 5 Pq uint
Number of
.Sy zfs_arc_evict_batch_limit
batches to process per parallel eviction task under heavy load to reduce the
number of context switches.
.
.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int
Sets the number of ARC eviction threads to be used.
.Pp

View File

@ -371,6 +371,12 @@ static uint_t zfs_arc_eviction_pct = 200;
*/
static uint_t zfs_arc_evict_batch_limit = 10;
/*
 * Number of batches to process per parallel eviction task under heavy load to
 * reduce the number of context switches.
 */
static uint_t zfs_arc_evict_batches_limit = 5;
/* number of seconds before growing cache again */
uint_t arc_grow_retry = 5;
@ -3899,7 +3905,7 @@ arc_set_need_free(void)
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
uint64_t spa, uint64_t bytes)
uint64_t spa, uint64_t bytes, boolean_t *more)
{
multilist_sublist_t *mls;
uint64_t bytes_evicted = 0, real_evicted = 0;
@ -3983,6 +3989,10 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
multilist_sublist_unlock(mls);
/* Indicate if another iteration may be productive. */
if (more)
*more = (hdr != NULL);
/*
* Increment the count of evicted bytes, and wake up any threads that
* are waiting for the count to reach this value. Since the list is
@ -4003,21 +4013,12 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
while ((aw = list_head(&arc_evict_waiters)) != NULL &&
aw->aew_count <= arc_evict_count) {
list_remove(&arc_evict_waiters, aw);
cv_broadcast(&aw->aew_cv);
cv_signal(&aw->aew_cv);
}
}
arc_set_need_free();
mutex_exit(&arc_evict_lock);
/*
* If the ARC size is reduced from arc_c_max to arc_c_min (especially
* if the average cached block is small), eviction can be on-CPU for
* many seconds. To ensure that other threads that may be bound to
* this CPU are able to make progress, make a voluntary preemption
* call here.
*/
kpreempt(KPREEMPT_SYNC);
return (bytes_evicted);
}
@ -4078,8 +4079,18 @@ static void
arc_evict_task(void *arg)
{
evict_arg_t *eva = arg;
eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx,
eva->eva_marker, eva->eva_spa, eva->eva_bytes);
uint64_t total_evicted = 0;
boolean_t more;
uint_t batches = zfs_arc_evict_batches_limit;
/* Process multiple batches to amortize taskq dispatch overhead. */
do {
total_evicted += arc_evict_state_impl(eva->eva_ml,
eva->eva_idx, eva->eva_marker, eva->eva_spa,
eva->eva_bytes - total_evicted, &more);
} while (total_evicted < eva->eva_bytes && --batches > 0 && more);
eva->eva_evicted = total_evicted;
}
static void
@ -4220,18 +4231,19 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
if (bytes == ARC_EVICT_ALL) {
evict = bytes;
} else if (left > ntasks * MIN_EVICT_SIZE) {
} else if (left >= ntasks * MIN_EVICT_SIZE) {
evict = DIV_ROUND_UP(left, ntasks);
} else {
ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
if (ntasks == 1)
ntasks = left / MIN_EVICT_SIZE;
if (ntasks < 2)
use_evcttq = B_FALSE;
else
evict = DIV_ROUND_UP(left, ntasks);
}
}
for (int i = 0; sublists_left > 0; i++, sublist_idx++,
sublists_left--) {
uint64_t bytes_remaining;
uint64_t bytes_evicted;
/* we've reached the end, wrap to the beginning */
@ -4253,16 +4265,17 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
continue;
}
if (total_evicted < bytes)
bytes_remaining = bytes - total_evicted;
else
break;
bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
markers[sublist_idx], spa, bytes_remaining);
markers[sublist_idx], spa, bytes - total_evicted,
NULL);
scan_evicted += bytes_evicted;
total_evicted += bytes_evicted;
if (total_evicted < bytes)
kpreempt(KPREEMPT_SYNC);
else
break;
}
if (use_evcttq) {
@ -4887,7 +4900,7 @@ arc_evict_cb(void *arg, zthr_t *zthr)
*/
arc_evict_waiter_t *aw;
while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
cv_broadcast(&aw->aew_cv);
cv_signal(&aw->aew_cv);
}
arc_set_need_free();
}
@ -5168,9 +5181,8 @@ arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve)
uint64_t last_count = 0;
mutex_enter(&arc_evict_lock);
if (!list_is_empty(&arc_evict_waiters)) {
arc_evict_waiter_t *last =
list_tail(&arc_evict_waiters);
arc_evict_waiter_t *last;
if ((last = list_tail(&arc_evict_waiters)) != NULL) {
last_count = last->aew_count;
} else if (!arc_evict_needed) {
arc_evict_needed = B_TRUE;
@ -11288,6 +11300,9 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
"The number of headers to evict per sublist before moving to the next");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batches_limit, UINT, ZMOD_RW,
"The number of batches to run per parallel eviction task");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
"Number of arc_prune threads");