diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 6134ac1f5..dbe712e2e 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -469,6 +469,12 @@ typedef struct l2arc_dev { boolean_t l2ad_thread_exit; /* signal thread exit */ kmutex_t l2ad_feed_thr_lock; /* thread sleep/wake */ kcondvar_t l2ad_feed_cv; /* thread wakeup cv */ + /* + * Consecutive cycles where metadata filled write budget + * while data passes got nothing written. Used to detect + * monopolization and skip metadata to give data a chance. + */ + uint64_t l2ad_meta_cycles; } l2arc_dev_t; /* diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index d01a36821..509b3109a 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -233,6 +233,13 @@ resetting its markers to the tail. Lower values keep the marker closer to the tail under active workloads. Set to 0 to disable the depth cap. . +.It Sy l2arc_meta_cycles Ns = Ns Sy 2 Pq u64 +How many consecutive cycles metadata may monopolize the write budget +before being skipped to let data run. +The default of 2 gives metadata roughly 67% and data 33% of L2ARC +write bandwidth under sustained load. +Higher values favor metadata; set to 0 to disable. +. .It Sy l2arc_write_max Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 Maximum write rate in bytes per second for each L2ARC device. Used directly during initial fill, when DWPD limiting is disabled, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index d59ce0225..166304959 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -963,6 +963,15 @@ static int l2arc_mfuonly = 0; */ static uint64_t l2arc_ext_headroom_pct = 25; +/* + * Metadata monopolization limit. When metadata fills the write budget + * for this many consecutive cycles while data gets nothing, skip metadata + * for one cycle to let data run, then reset the counter. + * With N=2, the steady-state pattern under sustained monopolization is + * 2 metadata cycles followed by 1 data cycle (67%/33% split). + */ +static uint64_t l2arc_meta_cycles = 2; + /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of @@ -9998,6 +10007,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) /* * Copy buffers for L2ARC writing. */ + boolean_t skip_meta = (save_position && + l2arc_meta_cycles > 0 && + dev->l2ad_meta_cycles >= l2arc_meta_cycles); + if (skip_meta) + dev->l2ad_meta_cycles = 0; + for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* * pass == 0: MFU meta @@ -10013,6 +10028,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } + if (skip_meta && pass <= L2ARC_MRU_META) + continue; + headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; @@ -10080,6 +10098,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) (spa->spa_l2arc_info.l2arc_next_sublist[pass] + 1) % num_sublists; + /* + * Count consecutive metadata monopolization toward + * l2arc_meta_cycles. Only count when metadata actually + * filled the write budget, starving data passes. + */ + if (save_position && pass <= L2ARC_MRU_META && full) + dev->l2ad_meta_cycles++; + /* * Depth cap: track cumulative bytes scanned per pass * and reset markers when the scan cap is reached. @@ -10109,6 +10135,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + /* + * If nothing was written at all, reset monopolization counter. + * No point skipping metadata if data has nothing either. + */ + if (write_asize == 0) + dev->l2ad_meta_cycles = 0; + /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_psize); @@ -11752,6 +11785,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, "Exclude dbufs on special vdevs from being cached to L2ARC if set."); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_cycles, U64, ZMOD_RW, + "Consecutive metadata cycles before skipping to let data run"); + ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, ext_headroom_pct, U64, ZMOD_RW, "Depth cap as percentage of state size for marker reset"); diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 4008bfe2b..e2b829b84 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -53,6 +53,7 @@ L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled L2ARC_TRIM_AHEAD l2arc.trim_ahead l2arc_trim_ahead +L2ARC_META_CYCLES l2arc.meta_cycles l2arc_meta_cycles L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc LIVELIST_CONDENSE_SYNC_CANCEL livelist.condense.sync_cancel zfs_livelist_condense_sync_cancel