ARC: parallel eviction

On systems with enormous amounts of memory, the single arc_evict thread
can become a bottleneck if reads and writes are stuck behind it, waiting
for old data to be evicted before new data can take its place.

This commit adds support for evicting from multiple ARC lists in
parallel, by farming the evict work out to some number of threads and
then accumulating their results.
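
At its core this is a fan-out/fan-in pattern over the kernel taskq API. A minimal sketch, using the taskq calls and the evict_arg_t/arc_evict_task() pieces introduced by this patch (the helper name evict_parallel is illustrative only):

    static uint64_t
    evict_parallel(evict_arg_t *eva, uint_t ntasks)
    {
        uint64_t evicted = 0;

        /* Fan out: dispatch one task per prepared argument block. */
        for (uint_t i = 0; i < ntasks; i++)
            taskq_dispatch_ent(arc_evict_taskq, arc_evict_task,
                &eva[i], 0, &eva[i].eva_tqent);

        /* Fan in: wait for all tasks, then accumulate their results. */
        taskq_wait(arc_evict_taskq);
        for (uint_t i = 0; i < ntasks; i++)
            evicted += eva[i].eva_evicted;

        return (evicted);
    }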

A new tunable, zfs_arc_evict_threads, sets the number of threads. By
default, the thread count scales with the number of CPUs.
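
The parameter can only be set at module load time. On Linux, for example, the autoscaling could be overridden with a modprobe option (the value 4 here is purely illustrative):

    # /etc/modprobe.d/zfs.conf
    options zfs zfs_arc_evict_threads=4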

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Youzhong Yang <youzhong@gmail.com>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Mateusz Piotrowski <mateusz.piotrowski@klarasystems.com>
Signed-off-by: Alexander Stetsenko <alex.stetsenko@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Mateusz Piotrowski <mateusz.piotrowski@klarasystems.com>
Co-authored-by: Alexander Stetsenko <alex.stetsenko@klarasystems.com>
Closes #16486
commit b6916f995e (parent 89a8a91582)
Allan Jude, 2025-05-14 10:38:32 -04:00, committed by GitHub
2 changed files with 233 additions and 21 deletions

man/man4/zfs.4

@@ -3,7 +3,7 @@
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024 Klara, Inc.
.\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
@@ -17,9 +17,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.\" Copyright (c) 2024, Klara, Inc.
.\"
.Dd November 1, 2024
.Dd May 7, 2025
.Dt ZFS 4
.Os
.
@@ -740,6 +738,40 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
This batch-style operation prevents entire sub-lists from being evicted at once
but comes at a cost of additional unlocking and locking.
.
.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int
Sets the number of ARC eviction threads to be used.
.Pp
If set greater than 0, ZFS will dedicate up to that many threads to ARC
eviction.
Each thread will process one sub-list at a time,
until the eviction target is reached or all sub-lists have been processed.
When set to 0, ZFS will compute a reasonable number of eviction threads based
on the number of CPUs.
.TS
box;
lb l .
CPUs	Threads
_
1-5	1
6-7	2
8-15	3
16-31	4
32-63	6
64-95	8
96-127	9
128-159	11
160-191	12
192-223	13
224-255	14
256+	16
.TE
.Pp
More threads may improve the responsiveness of ZFS to memory pressure.
This can be important for performance when eviction from the ARC becomes
a bottleneck for reads and writes.
.Pp
This parameter can only be set at module load time.
.
.It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
If set to a non zero value, it will replace the
.Sy arc_grow_retry

module/zfs/arc.c

@@ -27,7 +27,7 @@
* Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2020, George Amanakis. All rights reserved.
* Copyright (c) 2019, 2024, Klara Inc.
* Copyright (c) 2019, 2024, 2025, Klara, Inc.
* Copyright (c) 2019, Allan Jude
* Copyright (c) 2020, The FreeBSD Foundation [1]
* Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
@@ -337,6 +337,9 @@ static kmutex_t arc_evict_lock;
static boolean_t arc_evict_needed = B_FALSE;
static clock_t arc_last_uncached_flush;
static taskq_t *arc_evict_taskq;
static struct evict_arg *arc_evict_arg;
/*
* Count of bytes evicted since boot.
*/
@@ -470,6 +473,18 @@ static int zfs_arc_prune_task_threads = 1;
/* Used by spa_export/spa_destroy to flush the arc asynchronously */
static taskq_t *arc_flush_taskq;
/*
* Controls the number of ARC eviction threads to dispatch sublists to.
*
* Possible values:
* 0 (auto) compute the number of threads using a logarithmic formula.
* 1 (disabled) one thread - parallel eviction is disabled.
* 2+ (manual) set the number manually.
*
* See arc_evict_thread_init() for how "auto" is computed.
*/
static uint_t zfs_arc_evict_threads = 0;
/* The 7 states: */
arc_state_t ARC_anon;
arc_state_t ARC_mru;
@@ -4049,6 +4064,62 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
kmem_free(markers, sizeof (*markers) * count);
}
typedef struct evict_arg {
taskq_ent_t eva_tqent;		/* taskq entry used to dispatch this task */
multilist_t *eva_ml;		/* the multilist to evict from */
arc_buf_hdr_t *eva_marker;	/* marker for our position in the sublist */
int eva_idx;			/* sublist index */
uint64_t eva_spa;		/* spa to evict from (0 means any) */
uint64_t eva_bytes;		/* target number of bytes to evict */
uint64_t eva_evicted;		/* out: bytes actually evicted */
} evict_arg_t;
static void
arc_evict_task(void *arg)
{
evict_arg_t *eva = arg;
eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx,
eva->eva_marker, eva->eva_spa, eva->eva_bytes);
}
static void
arc_evict_thread_init(void)
{
if (zfs_arc_evict_threads == 0) {
/*
* Compute number of threads we want to use for eviction.
*
* Normally, it's log2(ncpus) + ncpus/32, which gets us to the
* default max of 16 threads at ~256 CPUs.
*
* However, that formula gives two threads at 4 CPUs, which
* is still rather too low to be really useful, so we just go
* with 1 thread at fewer than 6 cores.
*/
if (max_ncpus < 6)
zfs_arc_evict_threads = 1;
else
zfs_arc_evict_threads =
(highbit64(max_ncpus) - 1) + max_ncpus / 32;
} else if (zfs_arc_evict_threads > max_ncpus)
zfs_arc_evict_threads = max_ncpus;
if (zfs_arc_evict_threads > 1) {
arc_evict_taskq = taskq_create("arc_evict",
zfs_arc_evict_threads, defclsyspri, 0, INT_MAX,
TASKQ_PREPOPULATE);
arc_evict_arg = kmem_zalloc(
sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP);
}
}
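
To sanity-check the autoscaling formula, here is a small userland sketch (a portable stand-in for the kernel's highbit64() is assumed; the constants mirror arc_evict_thread_init() above):

    #include <stdio.h>

    /* Stand-in for highbit64(): 1-based index of the highest set bit. */
    static unsigned
    highbit64(unsigned long long n)
    {
        unsigned b = 0;
        while (n != 0) {
            b++;
            n >>= 1;
        }
        return (b);
    }

    static unsigned
    evict_threads(unsigned ncpus)
    {
        if (ncpus < 6)
            return (1);
        return ((highbit64(ncpus) - 1) + ncpus / 32);
    }

    int
    main(void)
    {
        unsigned cpus[] = { 4, 8, 16, 32, 64, 256 };

        /* Prints 1, 3, 4, 6, 8 and 16, matching the zfs.4 table. */
        for (int i = 0; i < 6; i++)
            printf("%u CPUs -> %u threads\n", cpus[i],
                evict_threads(cpus[i]));
        return (0);
    }
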
/*
* The minimum number of bytes we can evict at once is a block size.
* So, SPA_MAXBLOCKSIZE is a reasonable minimum value per eviction task.
* We use this value to compute a scaling factor for the eviction tasks.
*/
#define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)
/*
* Evict buffers from the given arc state, until we've removed the
* specified number of bytes. Move the removed buffers to the
@@ -4070,9 +4141,12 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
multilist_t *ml = &state->arcs_list[type];
int num_sublists;
arc_buf_hdr_t **markers;
evict_arg_t *eva = NULL;
num_sublists = multilist_get_num_sublists(ml);
boolean_t use_evcttq = zfs_arc_evict_threads > 1;
/*
* If we've tried to evict from each sublist, made some
* progress, but still have not hit the target number of bytes
@@ -4094,25 +4168,91 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
multilist_sublist_unlock(mls);
}
if (use_evcttq) {
if (zthr_iscurthread(arc_evict_zthr))
eva = arc_evict_arg;
else
eva = kmem_alloc(sizeof (evict_arg_t) *
zfs_arc_evict_threads, KM_NOSLEEP);
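/*
 * Note: there is only ever one arc_evict_zthr, so it can safely use
 * the preallocated arc_evict_arg array; any other caller allocates
 * its own copy, and uses KM_NOSLEEP because blocking for memory
 * while trying to free memory could stall reclaim.
 */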
if (eva) {
for (int i = 0; i < zfs_arc_evict_threads; i++) {
taskq_init_ent(&eva[i].eva_tqent);
eva[i].eva_ml = ml;
eva[i].eva_spa = spa;
}
} else {
/*
* Fall back to the regular single evict if it is not
* possible to allocate memory for the taskq entries.
*/
use_evcttq = B_FALSE;
}
}
/*
* Start eviction using a randomly selected sublist, this is to try and
* evenly balance eviction across all sublists. Always starting at the
* same sublist (e.g. index 0) would cause evictions to favor certain
* sublists over others.
*/
uint64_t scan_evicted = 0;
int sublists_left = num_sublists;
int sublist_idx = multilist_get_random_index(ml);
/*
* While we haven't hit our target number of bytes to evict, or
* we're evicting all available buffers.
*/
while (total_evicted < bytes) {
int sublist_idx = multilist_get_random_index(ml);
uint64_t scan_evicted = 0;
uint64_t evict = MIN_EVICT_SIZE;
uint_t ntasks = zfs_arc_evict_threads;
/*
* Start eviction using a randomly selected sublist,
* this is to try and evenly balance eviction across all
* sublists. Always starting at the same sublist
* (e.g. index 0) would cause evictions to favor certain
* sublists over others.
*/
for (int i = 0; i < num_sublists; i++) {
if (use_evcttq) {
if (sublists_left < ntasks)
ntasks = sublists_left;
if (ntasks < 2)
use_evcttq = B_FALSE;
}
if (use_evcttq) {
uint64_t left = bytes - total_evicted;
if (bytes == ARC_EVICT_ALL) {
evict = bytes;
} else if (left > ntasks * MIN_EVICT_SIZE) {
evict = DIV_ROUND_UP(left, ntasks);
} else {
ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
if (ntasks == 1)
use_evcttq = B_FALSE;
}
}
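/*
 * Worked example (SPA_MAXBLOCKSIZE is 16 MiB): with ntasks = 8 and
 * 256 MiB left to evict, each task is asked for 32 MiB.  With only
 * 100 MiB left, 100 MiB < 8 * 16 MiB, so ntasks drops to
 * DIV_ROUND_UP(100 MiB, 16 MiB) = 7 and each task keeps the 16 MiB
 * minimum.
 */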
for (int i = 0; sublists_left > 0; i++, sublist_idx++,
sublists_left--) {
uint64_t bytes_remaining;
uint64_t bytes_evicted;
/* we've reached the end, wrap to the beginning */
if (sublist_idx >= num_sublists)
sublist_idx = 0;
if (use_evcttq) {
if (i == ntasks)
break;
eva[i].eva_marker = markers[sublist_idx];
eva[i].eva_idx = sublist_idx;
eva[i].eva_bytes = evict;
taskq_dispatch_ent(arc_evict_taskq,
arc_evict_task, &eva[i], 0,
&eva[i].eva_tqent);
continue;
}
if (total_evicted < bytes)
bytes_remaining = bytes - total_evicted;
else
@@ -4123,18 +4263,23 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
scan_evicted += bytes_evicted;
total_evicted += bytes_evicted;
}
/* we've reached the end, wrap to the beginning */
if (++sublist_idx >= num_sublists)
sublist_idx = 0;
if (use_evcttq) {
taskq_wait(arc_evict_taskq);
for (int i = 0; i < ntasks; i++) {
scan_evicted += eva[i].eva_evicted;
total_evicted += eva[i].eva_evicted;
}
}
/*
* If we didn't evict anything during this scan, we have
* no reason to believe we'll evict more during another
* If we scanned all sublists and didn't evict anything, we
* have no reason to believe we'll evict more during another
* scan, so break the loop.
*/
if (scan_evicted == 0) {
if (scan_evicted == 0 && sublists_left == 0) {
/* This isn't possible, let's make that obvious */
ASSERT3S(bytes, !=, 0);
@@ -4151,13 +4296,33 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
break;
}
/*
* If we scanned all sublists but still have more to do,
* reset the counts so we can go around again.
*/
if (sublists_left == 0) {
sublists_left = num_sublists;
sublist_idx = multilist_get_random_index(ml);
scan_evicted = 0;
/*
* Since we're about to reconsider all sublists,
* re-enable use of the evict threads if available.
*/
use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL);
}
}
if (eva != NULL && eva != arc_evict_arg)
kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads);
for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_remove(mls, markers[i]);
multilist_sublist_unlock(mls);
}
if (markers != arc_state_evict_markers)
arc_state_free_markers(markers, num_sublists);
@@ -7805,6 +7970,7 @@ arc_set_limits(uint64_t allmem)
/* How to set default max varies by platform. */
arc_c_max = arc_default_max(arc_c_min, allmem);
}
void
arc_init(void)
{
@@ -7882,6 +8048,8 @@ arc_init(void)
arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
arc_evict_thread_init();
list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
offsetof(arc_async_flush_t, af_node));
mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -7982,11 +8150,20 @@ arc_fini(void)
list_destroy(&arc_prune_list);
mutex_destroy(&arc_prune_mtx);
if (arc_evict_taskq != NULL)
taskq_wait(arc_evict_taskq);
(void) zthr_cancel(arc_evict_zthr);
(void) zthr_cancel(arc_reap_zthr);
arc_state_free_markers(arc_state_evict_markers,
arc_state_evict_marker_count);
if (arc_evict_taskq != NULL) {
taskq_destroy(arc_evict_taskq);
kmem_free(arc_evict_arg,
sizeof (evict_arg_t) * zfs_arc_evict_threads);
}
mutex_destroy(&arc_evict_lock);
list_destroy(&arc_evict_waiters);
@@ -11110,3 +11287,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
"Number of arc_prune threads");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
"Number of threads to use for ARC eviction.");