L2ARC: Scan-based depth cap for persistent markers

With persistent markers and inclusive scanning, the marker traverses the
entire ARC state across many feed cycles, writing buffers far from the
tail that may no longer be relevant.

Track the cumulative bytes scanned per pass in l2arc_ext_scanned. When a
pass has scanned l2arc_ext_headroom_pct (default 25%) of its ARC state's
size, reset that pass's markers to the tail via the lazy reset flags. This
keeps markers focused on the tail zone, where buffers that will soon be
evicted have the most value for L2ARC.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #18289
This commit is contained in:
Ameer Hamza 2026-03-05 20:13:26 +05:00 committed by Brian Behlendorf
parent 15fc3d64c8
commit 62ca8f721b
4 changed files with 81 additions and 0 deletions

View File

@ -46,6 +46,10 @@ extern "C" {
* and each of the states has two types: data and metadata.
*/
#define L2ARC_FEED_TYPES 4
/*
 * Pass indices, one per (ARC state, buffer type) combination:
 * MFU metadata, MRU metadata, MFU data, MRU data.
 */
#define L2ARC_MFU_META 0
#define L2ARC_MRU_META 1
#define L2ARC_MFU_DATA 2
#define L2ARC_MRU_DATA 3
/*
* L2ARC state and statistics for persistent marker management.
@ -62,6 +66,12 @@ typedef struct l2arc_info {
boolean_t *l2arc_sublist_busy[L2ARC_FEED_TYPES];
boolean_t *l2arc_sublist_reset[L2ARC_FEED_TYPES];
kmutex_t l2arc_sublist_lock; /* protects busy/reset flags */
/*
* Cumulative bytes scanned per pass since marker reset.
* Limits how far persistent markers advance from tail
* before resetting, based on % of state size.
*/
uint64_t l2arc_ext_scanned[L2ARC_FEED_TYPES];
int l2arc_next_sublist[L2ARC_FEED_TYPES]; /* round-robin */
} l2arc_info_t;

View File

@ -227,6 +227,12 @@ to enable caching/reading prefetches to/from L2ARC.
.It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int
No reads during writes.
.
.It Sy l2arc_ext_headroom_pct Ns = Ns Sy 25 Pq u64
Percentage of each ARC state's size that a pass may scan before
resetting its markers to the tail.
Lower values keep the marker closer to the tail under active workloads.
Set to 0 to disable the depth cap.
.
.It Sy l2arc_write_max Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64
Maximum write rate in bytes per second for each L2ARC device.
Used directly during initial fill, when DWPD limiting is disabled,

View File

@ -956,6 +956,13 @@ int l2arc_exclude_special = 0;
*/
static int l2arc_mfuonly = 0;
/*
 * Depth cap as percentage of state size. Each pass resets its markers
 * to tail after scanning this fraction of the state. Keeps markers
 * focused on the tail zone where L2ARC adds the most value.
 * A value of 0 disables the depth cap: the computed scan cap is then 0
 * and no marker reset is triggered by it.
 */
static uint64_t l2arc_ext_headroom_pct = 25;
/*
* L2ARC TRIM
* l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
@ -9083,6 +9090,8 @@ l2arc_pool_markers_init(spa_t *spa)
spa->spa_l2arc_info.l2arc_markers[pass][i]);
multilist_sublist_unlock(mls);
}
spa->spa_l2arc_info.l2arc_ext_scanned[pass] = 0;
}
}
@ -9875,6 +9884,31 @@ l2arc_blk_fetch_done(zio_t *zio)
kmem_free(cb, sizeof (l2arc_read_callback_t));
}
/*
 * Look up the size of the ARC state that corresponds to an L2ARC pass.
 *
 * pass is one of L2ARC_MFU_META, L2ARC_MRU_META, L2ARC_MFU_DATA or
 * L2ARC_MRU_DATA. The result is the current refcount value of the
 * matching arcs_size[] entry of arc_mfu or arc_mru. Any other pass
 * value yields 0.
 */
static uint64_t
l2arc_get_state_size(int pass)
{
	if (pass == L2ARC_MFU_META) {
		return (zfs_refcount_count(
		    &arc_mfu->arcs_size[ARC_BUFC_METADATA]));
	}

	if (pass == L2ARC_MRU_META) {
		return (zfs_refcount_count(
		    &arc_mru->arcs_size[ARC_BUFC_METADATA]));
	}

	if (pass == L2ARC_MFU_DATA) {
		return (zfs_refcount_count(
		    &arc_mfu->arcs_size[ARC_BUFC_DATA]));
	}

	if (pass == L2ARC_MRU_DATA) {
		return (zfs_refcount_count(
		    &arc_mru->arcs_size[ARC_BUFC_DATA]));
	}

	/* Not one of the four known passes. */
	return (0);
}
/*
* Flag all sublists for a single pass for lazy marker reset to tail.
* Each sublist's marker will be reset when next visited by a feed thread.
@ -9892,6 +9926,8 @@ l2arc_flag_pass_reset(spa_t *spa, int pass)
spa->spa_l2arc_info.l2arc_sublist_reset[pass][i] = B_TRUE;
multilist_sublist_unlock(mls);
}
spa->spa_l2arc_info.l2arc_ext_scanned[pass] = 0;
}
/*
@ -10044,6 +10080,31 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
(spa->spa_l2arc_info.l2arc_next_sublist[pass] + 1) %
num_sublists;
/*
* Depth cap: track cumulative bytes scanned per pass
* and reset markers when the scan cap is reached.
* Keeps the marker near the tail where L2ARC adds
* the most value.
*/
if (save_position) {
mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
spa->spa_l2arc_info.l2arc_ext_scanned[pass] +=
consumed_headroom;
uint64_t state_sz = l2arc_get_state_size(pass);
uint64_t scan_cap =
state_sz * l2arc_ext_headroom_pct / 100;
if (scan_cap > 0 &&
spa->spa_l2arc_info.l2arc_ext_scanned[pass] >=
scan_cap) {
l2arc_flag_pass_reset(spa, pass);
}
mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);
}
if (full == B_TRUE)
break;
}
@ -11691,6 +11752,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW,
"Exclude dbufs on special vdevs from being cached to L2ARC if set.");
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, ext_headroom_pct, U64, ZMOD_RW,
"Depth cap as percentage of state size for marker reset");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes");

View File

@ -47,6 +47,7 @@ INITIALIZE_VALUE initialize_value zfs_initialize_value
KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export
LUA_MAX_MEMLIMIT lua.max_memlimit zfs_lua_max_memlimit
L2ARC_DWPD_LIMIT l2arc.dwpd_limit l2arc_dwpd_limit
L2ARC_EXT_HEADROOM_PCT l2arc.ext_headroom_pct l2arc_ext_headroom_pct
L2ARC_MFUONLY l2arc.mfuonly l2arc_mfuonly
L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch
L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size