diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index b1bea24ac..6134ac1f5 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -46,6 +46,10 @@ extern "C" { * and each of the states has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 +#define L2ARC_MFU_META 0 /* MFU metadata */ +#define L2ARC_MRU_META 1 /* MRU metadata */ +#define L2ARC_MFU_DATA 2 /* MFU data */ +#define L2ARC_MRU_DATA 3 /* MRU data */ /* * L2ARC state and statistics for persistent marker management. @@ -62,6 +66,12 @@ typedef struct l2arc_info { boolean_t *l2arc_sublist_busy[L2ARC_FEED_TYPES]; boolean_t *l2arc_sublist_reset[L2ARC_FEED_TYPES]; kmutex_t l2arc_sublist_lock; /* protects busy/reset flags */ + /* + * Bytes scanned by each pass since its markers were last reset. + * Compared against l2arc_ext_headroom_pct percent of the state + * size to bound how far persistent markers advance from the tail. + */ + uint64_t l2arc_ext_scanned[L2ARC_FEED_TYPES]; int l2arc_next_sublist[L2ARC_FEED_TYPES]; /* round-robin */ } l2arc_info_t; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 97c0ac6ab..d01a36821 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -227,6 +227,12 @@ to enable caching/reading prefetches to/from L2ARC. .It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int No reads during writes. . +.It Sy l2arc_ext_headroom_pct Ns = Ns Sy 25 Pq u64 +Percentage of each ARC state's size that a pass may scan before +resetting its markers to the tail. +Lower values keep the marker closer to the tail under active workloads. +Set to 0 to disable the depth cap. +. .It Sy l2arc_write_max Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 Maximum write rate in bytes per second for each L2ARC device. Used directly during initial fill, when DWPD limiting is disabled, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 736faaf51..d59ce0225 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -956,6 +956,13 @@ int l2arc_exclude_special = 0; */ static int l2arc_mfuonly = 0; +/* + * Depth cap as percentage of state size. 
Each pass resets its markers + * to tail after scanning this fraction of the state. Keeps markers + * focused on the tail zone where L2ARC adds the most value. + */ +static uint64_t l2arc_ext_headroom_pct = 25; + /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of @@ -9083,6 +9090,8 @@ l2arc_pool_markers_init(spa_t *spa) spa->spa_l2arc_info.l2arc_markers[pass][i]); multilist_sublist_unlock(mls); } + + spa->spa_l2arc_info.l2arc_ext_scanned[pass] = 0; } } @@ -9875,6 +9884,31 @@ l2arc_blk_fetch_done(zio_t *zio) kmem_free(cb, sizeof (l2arc_read_callback_t)); } +/* + * Return the size of the ARC state/buffer-type pair that corresponds to the + * given L2ARC pass (L2ARC_MFU_META..L2ARC_MRU_DATA), or 0 for other values. + */ +static uint64_t +l2arc_get_state_size(int pass) +{ + switch (pass) { + case L2ARC_MFU_META: + return (zfs_refcount_count( + &arc_mfu->arcs_size[ARC_BUFC_METADATA])); + case L2ARC_MRU_META: + return (zfs_refcount_count( + &arc_mru->arcs_size[ARC_BUFC_METADATA])); + case L2ARC_MFU_DATA: + return (zfs_refcount_count( + &arc_mfu->arcs_size[ARC_BUFC_DATA])); + case L2ARC_MRU_DATA: + return (zfs_refcount_count( + &arc_mru->arcs_size[ARC_BUFC_DATA])); + default: + return (0); + } +} + /* * Flag all sublists for a single pass for lazy marker reset to tail. * Each sublist's marker will be reset when next visited by a feed thread. @@ -9892,6 +9926,8 @@ l2arc_flag_pass_reset(spa_t *spa, int pass) spa->spa_l2arc_info.l2arc_sublist_reset[pass][i] = B_TRUE; multilist_sublist_unlock(mls); } + + spa->spa_l2arc_info.l2arc_ext_scanned[pass] = 0; } /* @@ -10044,6 +10080,31 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) (spa->spa_l2arc_info.l2arc_next_sublist[pass] + 1) % num_sublists; + /* + * Depth cap: accumulate consumed_headroom per pass and + * flag the pass for marker reset once the total reaches + * l2arc_ext_headroom_pct% of the state size (a cap of 0 + * never triggers). Keeps the marker near the tail. 
+ */ + if (save_position) { + mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock); + + spa->spa_l2arc_info.l2arc_ext_scanned[pass] += + consumed_headroom; + + uint64_t state_sz = l2arc_get_state_size(pass); + uint64_t scan_cap = + state_sz * l2arc_ext_headroom_pct / 100; + + if (scan_cap > 0 && + spa->spa_l2arc_info.l2arc_ext_scanned[pass] >= + scan_cap) { + l2arc_flag_pass_reset(spa, pass); + } + + mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock); + } + if (full == B_TRUE) break; } @@ -11691,6 +11752,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, "Exclude dbufs on special vdevs from being cached to L2ARC if set."); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, ext_headroom_pct, U64, ZMOD_RW, + "Depth cap as percentage of state size for marker reset"); + ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 5e6959a54..4008bfe2b 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -47,6 +47,7 @@ INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export LUA_MAX_MEMLIMIT lua.max_memlimit zfs_lua_max_memlimit L2ARC_DWPD_LIMIT l2arc.dwpd_limit l2arc_dwpd_limit +L2ARC_EXT_HEADROOM_PCT l2arc.ext_headroom_pct l2arc_ext_headroom_pct L2ARC_MFUONLY l2arc.mfuonly l2arc_mfuonly L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size