L2ARC: Implement DWPD-based rate limiting with adaptive feed intervals

Add DWPD (Drive Writes Per Day) rate limiting to control L2ARC write
speeds and protect SSD endurance. Write rate is constrained by the
minimum of l2arc_write_max and DWPD-calculated budget. Devices
accumulate unused write budget over 24-hour periods with automatic reset
and carry-over. Writes occur in controlled bursts (max 64MB) with
adaptive intervals to achieve target rates. Applies after initial device
fill.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #18093
This commit is contained in:
Ameer Hamza 2025-09-25 23:24:03 +05:00 committed by Brian Behlendorf
parent b525525b44
commit d1f290f1ea
5 changed files with 184 additions and 46 deletions

View File

@ -65,6 +65,9 @@
#define param_set_arc_max_args(var) \
CTLTYPE_U64, NULL, 0, param_set_arc_max, "QU"
/*
 * Argument list for the l2arc_dwpd_limit tunable. Unlike the neighbouring
 * arc_max/arc_free_target entries, which pass NULL, this one forwards the
 * address of the backing variable; param_set_l2arc_dwpd_limit() hands that
 * pointer straight to sysctl_handle_64().
 */
#define param_set_l2arc_dwpd_limit_args(var) \
CTLTYPE_U64, &var, 0, param_set_l2arc_dwpd_limit, "QU"
#define param_set_arc_free_target_args(var) \
CTLTYPE_UINT, NULL, 0, param_set_arc_free_target, "IU"

View File

@ -442,6 +442,13 @@ typedef struct l2arc_dev {
*/
zfs_refcount_t l2ad_lb_count;
boolean_t l2ad_trim_all; /* TRIM whole device */
/*
* DWPD tracking with daily reset
*/
uint64_t l2ad_dwpd_writes; /* 24h bytes written */
uint64_t l2ad_dwpd_start; /* 24h period start */
uint64_t l2ad_dwpd_accumulated; /* Accumulated */
uint64_t l2ad_dwpd_bump; /* Reset trigger */
/*
* Per-device feed thread for parallel L2ARC writes
*/
@ -1087,6 +1094,7 @@ extern uint_t zfs_arc_pc_percent;
extern uint_t arc_lotsfree_percent;
extern uint64_t zfs_arc_min;
extern uint64_t zfs_arc_max;
extern uint64_t l2arc_dwpd_limit;
extern uint64_t arc_reduce_target_size(uint64_t to_free);
extern boolean_t arc_reclaim_needed(void);
@ -1106,6 +1114,8 @@ extern int param_set_arc_u64(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS);
extern int param_set_l2arc_dwpd_limit(ZFS_MODULE_PARAM_ARGS);
extern void l2arc_dwpd_bump_reset(void);
/* used in zdb.c */
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,

View File

@ -163,6 +163,22 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS)
return (0);
}
/*
 * Sysctl handler for the l2arc_dwpd_limit tunable.
 *
 * Lets sysctl_handle_64() process the request, then compares the limit
 * against the value sampled on entry. If it changed, the DWPD generation
 * is bumped via l2arc_dwpd_bump_reset().
 *
 * Returns the sysctl_handle_64() result when it is non-zero or when the
 * request carries no new value (req->newptr is NULL); otherwise 0.
 */
int
param_set_l2arc_dwpd_limit(SYSCTL_HANDLER_ARGS)
{
	const uint64_t prev_limit = l2arc_dwpd_limit;
	int rc = sysctl_handle_64(oidp, arg1, 0, req);

	/* Handler failure, or nothing was written: leave DWPD state alone. */
	if (rc != 0 || req->newptr == NULL)
		return (rc);

	if (prev_limit != l2arc_dwpd_limit)
		l2arc_dwpd_bump_reset();

	return (0);
}
int
param_set_arc_max(SYSCTL_HANDLER_ARGS)
{

View File

@ -410,6 +410,22 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
return (0);
}
/*
 * Module parameter setter for l2arc_dwpd_limit.
 *
 * Delegates parsing and storing to spl_param_set_u64(). A negative result
 * is returned through SET_ERROR(). On success the limit is compared with
 * the value sampled on entry, and the DWPD generation is bumped via
 * l2arc_dwpd_bump_reset() only if it actually changed.
 */
int
param_set_l2arc_dwpd_limit(const char *buf, zfs_kernel_param_t *kp)
{
	const uint64_t prev_limit = l2arc_dwpd_limit;
	int rc = spl_param_set_u64(buf, kp);

	if (rc < 0)
		return (SET_ERROR(rc));

	/* Writing the same value back must not reset DWPD tracking. */
	if (prev_limit != l2arc_dwpd_limit)
		l2arc_dwpd_bump_reset();

	return (0);
}
#ifdef CONFIG_MEMORY_HOTPLUG
static int
arc_hotplug_callback(struct notifier_block *self, unsigned long action,

View File

@ -820,7 +820,8 @@ typedef struct arc_async_flush {
* Level 2 ARC
*/
#define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */
#define L2ARC_WRITE_SIZE (64 * 1024 * 1024) /* initial write max */
#define L2ARC_BURST_SIZE_MAX (64 * 1024 * 1024) /* max burst size */
#define L2ARC_HEADROOM 8 /* num of writes */
/*
@ -831,9 +832,20 @@ typedef struct arc_async_flush {
#define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
/*
* Min L2ARC capacity to enable persistent markers, adaptive intervals, and
* DWPD rate limiting. L2ARC must be at least twice arc_c_max to benefit from
* inclusive caching - smaller L2ARC would either cyclically overwrite itself
* (if L2ARC < ARC) or merely duplicate ARC contents (if L2ARC = ARC).
* With L2ARC >= 2*ARC, there's room for ARC duplication plus additional
* cached data.
*/
#define L2ARC_PERSIST_THRESHOLD (arc_c_max * 2)
/* L2ARC Performance Tunables */
static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
uint64_t l2arc_dwpd_limit = 100; /* 100 = 1.0 DWPD */
static uint64_t l2arc_dwpd_bump = 0; /* DWPD reset trigger */
static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
@ -919,6 +931,7 @@ static void l2arc_read_done(zio_t *);
static void l2arc_do_free_on_write(l2arc_dev_t *dev);
static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
boolean_t state_only);
static uint64_t l2arc_get_write_rate(l2arc_dev_t *dev);
static void arc_prune_async(uint64_t adjust);
@ -8363,7 +8376,7 @@ arc_fini(void)
* may be necessary for different workloads:
*
* l2arc_write_max max write bytes per interval
* l2arc_write_boost extra write bytes during device warmup
* l2arc_dwpd_limit device write endurance limit (100 = 1.0 DWPD)
* l2arc_noprefetch skip caching prefetched buffers
* l2arc_headroom number of max device writes to precache
* l2arc_headroom_boost when we find compressed buffers during ARC
@ -8380,7 +8393,6 @@ arc_fini(void)
*
* l2arc_write_eligible() check if a buffer is eligible to cache
* l2arc_write_size() calculate how much to write
* l2arc_write_interval() calculate sleep delay between writes
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
@ -8501,24 +8513,22 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
}
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
l2arc_write_size(l2arc_dev_t *dev, clock_t *interval)
{
uint64_t size;
uint64_t write_rate = l2arc_get_write_rate(dev);
/*
* Make sure our globals have meaningful values in case the user
* altered them.
*/
size = l2arc_write_max;
if (size == 0) {
cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
"resetting it to the default (%d)", L2ARC_WRITE_SIZE);
size = l2arc_write_max = L2ARC_WRITE_SIZE;
if (write_rate > L2ARC_BURST_SIZE_MAX) {
/* Calculate interval to achieve desired rate with burst cap */
uint64_t feeds_per_sec =
MAX(DIV_ROUND_UP(write_rate, L2ARC_BURST_SIZE_MAX), 1);
*interval = hz / feeds_per_sec;
size = write_rate / feeds_per_sec;
} else {
*interval = hz; /* 1 second default */
size = write_rate;
}
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
/* We need to add in the worst case scenario of log block overhead. */
size += l2arc_log_blk_overhead(size, dev);
if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
@ -8543,28 +8553,6 @@ l2arc_write_size(l2arc_dev_t *dev)
}
static clock_t
l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
{
clock_t interval, next, now;
/*
* If the ARC lists are busy, increase our write rate; if the
* lists are stale, idle back. This is achieved by checking
* how much we previously wrote - if it was more than half of
* what we wanted, schedule the next write much sooner.
*/
if (l2arc_feed_again && wrote > (wanted / 2))
interval = (hz * l2arc_feed_min_ms) / 1000;
else
interval = hz * l2arc_feed_secs;
now = ddi_get_lbolt();
next = MAX(now, MIN(now + interval, began + interval));
return (next);
}
/*
* Free buffers that were tagged for destruction.
*/
@ -9170,6 +9158,85 @@ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
}
}
/*
 * Advance the global DWPD generation counter. Each device keeps its own
 * copy (l2ad_dwpd_bump); a mismatch with the global value makes
 * l2arc_dwpd_rate_limit() reset that device's DWPD statistics.
 */
void
l2arc_dwpd_bump_reset(void)
{
	l2arc_dwpd_bump += 1;
}
/*
 * Compute the DWPD-derived write rate for an L2ARC device.
 *
 * The daily budget is the device span (l2ad_end - l2ad_start) scaled by
 * l2arc_dwpd_limit / 100. Per-device tracking is restarted when the
 * tunable generation changed or when 24 hours have passed since
 * l2ad_dwpd_start:
 *   - generation change: carry-over is dropped and the device generation
 *     is resynchronized;
 *   - period expiry: the unused part of one daily budget is carried over.
 *
 * Returns the remaining budget (daily + carry-over + one burst) divided
 * by the seconds left in the current 24h period, or 0 when the budget is
 * already exhausted.
 */
static uint64_t
l2arc_dwpd_rate_limit(l2arc_dev_t *dev)
{
	const uint64_t day_secs = 24 * 3600;
	uint64_t dev_bytes = dev->l2ad_end - dev->l2ad_start;
	uint64_t per_day = (dev_bytes * l2arc_dwpd_limit) / 100;
	uint64_t now = gethrestime_sec();

	boolean_t bumped = (dev->l2ad_dwpd_bump != l2arc_dwpd_bump);
	boolean_t expired = (now - dev->l2ad_dwpd_start) >= day_secs;

	if (bumped) {
		/* Tunable changed: start from scratch, no carry-over. */
		dev->l2ad_dwpd_accumulated = 0;
		dev->l2ad_dwpd_bump = l2arc_dwpd_bump;
	} else if (expired) {
		/* Keep whatever was left of the last day (at most 1 day). */
		dev->l2ad_dwpd_accumulated =
		    (dev->l2ad_dwpd_writes < per_day) ?
		    per_day - dev->l2ad_dwpd_writes : 0;
	}

	if (bumped || expired) {
		/* Open a new 24h accounting period. */
		dev->l2ad_dwpd_writes = 0;
		dev->l2ad_dwpd_start = now;
	}

	/* Never divide by zero at the very end of the period. */
	uint64_t secs_left = day_secs - (now - dev->l2ad_dwpd_start);
	if (secs_left == 0)
		secs_left = 1;

	/* One extra burst covers the first write after the device wraps. */
	uint64_t allowance = per_day + dev->l2ad_dwpd_accumulated +
	    L2ARC_BURST_SIZE_MAX;
	uint64_t used = dev->l2ad_dwpd_writes;

	if (used >= allowance)
		return (0);

	return ((allowance - used) / secs_left);
}
/*
 * Pick the write rate for a device.
 *
 * The base rate is l2arc_write_max; a zero value is reported and reset to
 * L2ARC_WRITE_SIZE. The DWPD limit is applied on top of it only when all
 * of the following hold: the device has finished its first pass
 * (l2ad_first is clear), l2arc_dwpd_limit is non-zero, and the pool's
 * total L2ARC capacity reaches L2ARC_PERSIST_THRESHOLD. In that case the
 * smaller of the DWPD rate and the base rate is returned.
 */
static uint64_t
l2arc_get_write_rate(l2arc_dev_t *dev)
{
	spa_t *spa = dev->l2ad_spa;
	uint64_t rate_cap = l2arc_write_max;

	/* Guard against a user zeroing the tunable. */
	if (rate_cap == 0) {
		cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
		    "resetting it to the default (%d)", L2ARC_WRITE_SIZE);
		l2arc_write_max = L2ARC_WRITE_SIZE;
		rate_cap = l2arc_write_max;
	}

	/* First pass or DWPD limiting disabled: only write_max applies. */
	if (dev->l2ad_first || l2arc_dwpd_limit == 0)
		return (rate_cap);

	/* DWPD limiting is tied to persistent marker configurations. */
	if (spa->spa_l2arc_info.l2arc_total_capacity <
	    L2ARC_PERSIST_THRESHOLD)
		return (rate_cap);

	uint64_t dwpd_rate = l2arc_dwpd_rate_limit(dev);

	return (dwpd_rate < rate_cap ? dwpd_rate : rate_cap);
}
/*
* Evict buffers from the device write hand to the distance specified in
* bytes. This distance may span populated buffers, it may span nothing.
@ -9379,6 +9446,14 @@ out:
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
/*
* Reset DWPD counters - first pass writes are free, start
* fresh 24h budget period now that device is full.
*/
dev->l2ad_dwpd_writes = 0;
dev->l2ad_dwpd_start = gethrestime_sec();
dev->l2ad_dwpd_accumulated = 0;
dev->l2ad_dwpd_bump = l2arc_dwpd_bump;
goto top;
}
@ -9852,9 +9927,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* vs ARC size. Use persistent markers for pools with significant
* L2ARC investment, otherwise use simple HEAD/TAIL scanning.
*/
uint64_t threshold = MIN((arc_c_max / 4), arc_c);
boolean_t save_position =
(spa->spa_l2arc_info.l2arc_total_capacity >= threshold);
(spa->spa_l2arc_info.l2arc_total_capacity >=
L2ARC_PERSIST_THRESHOLD);
/*
* Check if markers need reset based on smallest device threshold.
@ -9988,6 +10063,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
spa->spa_l2arc_info.l2arc_total_writes += write_asize;
mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);
/* Track writes for DWPD rate limiting */
dev->l2ad_dwpd_writes += write_asize;
/*
* Update the device header after the zio completes as
* l2arc_write_done() may have updated the memory holding the log block
@ -10070,7 +10148,8 @@ l2arc_feed_thread(void *arg)
ARCSTAT_BUMP(arcstat_l2_feeds);
size = l2arc_write_size(dev);
clock_t interval;
size = l2arc_write_size(dev, &interval);
/*
* Evict L2ARC buffers that will be overwritten.
@ -10083,9 +10162,18 @@ l2arc_feed_thread(void *arg)
wrote = l2arc_write_buffers(spa, dev, size);
/*
* Calculate interval between writes.
* Adjust interval based on actual write.
*/
next = l2arc_write_interval(begin, size, wrote);
if (wrote == 0)
interval = hz * l2arc_feed_secs;
else if (wrote < size)
interval = (interval * wrote) / size;
/*
* Calculate next feed time.
*/
clock_t now = ddi_get_lbolt();
next = MAX(now, MIN(now + interval, begin + interval));
spa_config_exit(spa, SCL_L2ARC, dev);
}
spl_fstrans_unmark(cookie);
@ -10255,6 +10343,10 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
adddev->l2ad_trim_all = B_FALSE;
adddev->l2ad_dwpd_writes = 0;
adddev->l2ad_dwpd_start = gethrestime_sec();
adddev->l2ad_dwpd_accumulated = 0;
adddev->l2ad_dwpd_bump = l2arc_dwpd_bump;
list_link_init(&adddev->l2ad_node);
adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
@ -11536,8 +11628,9 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW,
"Max write bytes per interval");
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW,
"Extra write bytes during device warmup");
ZFS_MODULE_PARAM_CALL(zfs_l2arc, l2arc_, dwpd_limit, param_set_l2arc_dwpd_limit,
spl_param_get_u64, ZMOD_RW,
"L2ARC device endurance limit as percentage (100 = 1.0 DWPD)");
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW,
"Number of max device writes to precache");