diff --git a/include/os/freebsd/spl/sys/mod.h b/include/os/freebsd/spl/sys/mod.h
index 2aa66bbe1..c6425e888 100644
--- a/include/os/freebsd/spl/sys/mod.h
+++ b/include/os/freebsd/spl/sys/mod.h
@@ -65,6 +65,9 @@
 #define	param_set_arc_max_args(var) \
     CTLTYPE_U64, NULL, 0, param_set_arc_max, "QU"
 
+#define	param_set_l2arc_dwpd_limit_args(var) \
+    CTLTYPE_U64, &var, 0, param_set_l2arc_dwpd_limit, "QU"
+
 #define	param_set_arc_free_target_args(var) \
     CTLTYPE_UINT, NULL, 0, param_set_arc_free_target, "IU"
 
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index fcaf9a10e..1c400c513 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -442,6 +442,13 @@ typedef struct l2arc_dev {
 	 */
 	zfs_refcount_t		l2ad_lb_count;
 	boolean_t		l2ad_trim_all; /* TRIM whole device */
+	/*
+	 * DWPD tracking with daily reset
+	 */
+	uint64_t		l2ad_dwpd_writes;	/* 24h bytes written */
+	uint64_t		l2ad_dwpd_start;	/* 24h period start */
+	uint64_t		l2ad_dwpd_accumulated;	/* Unused carryover */
+	uint64_t		l2ad_dwpd_bump;		/* Reset generation */
 	/*
 	 * Per-device feed thread for parallel L2ARC writes
 	 */
@@ -1087,6 +1094,7 @@ extern uint_t zfs_arc_pc_percent;
 extern uint_t arc_lotsfree_percent;
 extern uint64_t zfs_arc_min;
 extern uint64_t zfs_arc_max;
+extern uint64_t l2arc_dwpd_limit;
 
 extern uint64_t arc_reduce_target_size(uint64_t to_free);
 extern boolean_t arc_reclaim_needed(void);
@@ -1106,6 +1114,8 @@ extern int param_set_arc_u64(ZFS_MODULE_PARAM_ARGS);
 extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
 extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS);
 extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS);
+extern int param_set_l2arc_dwpd_limit(ZFS_MODULE_PARAM_ARGS);
+extern void l2arc_dwpd_bump_reset(void);
 
 /* used in zdb.c */
 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index 393bfaa65..a06e00d73 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -163,6 +163,26 @@ param_set_arc_int(SYSCTL_HANDLER_ARGS)
 	return (0);
 }
 
+/*
+ * Sysctl handler for l2arc_dwpd_limit: after a successful write that
+ * changes the value, call l2arc_dwpd_bump_reset().
+ */
+int
+param_set_l2arc_dwpd_limit(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t old_val = l2arc_dwpd_limit;
+	int err;
+
+	err = sysctl_handle_64(oidp, arg1, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (l2arc_dwpd_limit != old_val)
+		l2arc_dwpd_bump_reset();
+
+	return (0);
+}
+
 int
 param_set_arc_max(SYSCTL_HANDLER_ARGS)
 {
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index 6478c834b..dbc9aad93 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -410,6 +410,26 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
 	return (0);
 }
 
+/*
+ * Module parameter setter for l2arc_dwpd_limit: after a successful set
+ * that changes the value, call l2arc_dwpd_bump_reset().
+ */
+int
+param_set_l2arc_dwpd_limit(const char *buf, zfs_kernel_param_t *kp)
+{
+	uint64_t old_val = l2arc_dwpd_limit;
+	int error;
+
+	error = spl_param_set_u64(buf, kp);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	if (l2arc_dwpd_limit != old_val)
+		l2arc_dwpd_bump_reset();
+
+	return (0);
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 static int
 arc_hotplug_callback(struct notifier_block *self, unsigned long action,
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 09411466e..afcf7ba3d 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -820,7 +820,8 @@ typedef struct arc_async_flush {
  * Level 2 ARC
  */
 
-#define	L2ARC_WRITE_SIZE	(32 * 1024 * 1024)	/* initial write max */
+#define	L2ARC_WRITE_SIZE	(64 * 1024 * 1024)	/* initial write max */
+#define	L2ARC_BURST_SIZE_MAX	(64 * 1024 * 1024)	/* max burst size */
 #define	L2ARC_HEADROOM		8			/* num of writes */
 
 /*
@@ -831,9 +832,20 @@ typedef struct arc_async_flush {
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
+/*
+ * Min L2ARC capacity to enable persistent markers, adaptive intervals, and
+ * DWPD rate limiting. L2ARC must be at least twice arc_c_max to benefit from
+ * inclusive caching - smaller L2ARC would either cyclically overwrite itself
+ * (if L2ARC < ARC) or merely duplicate ARC contents (if L2ARC = ARC).
+ * With L2ARC >= 2*ARC, there's room for ARC duplication plus additional
+ * cached data.
+ */
+#define	L2ARC_PERSIST_THRESHOLD	(arc_c_max * 2)
+
 /* L2ARC Performance Tunables */
 static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
-static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
+uint64_t l2arc_dwpd_limit = 100;			/* 100 = 1.0 DWPD */
+static uint64_t l2arc_dwpd_bump = 0;			/* DWPD reset trigger */
 static uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* # of dev writes */
 static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
@@ -919,6 +931,7 @@ static void l2arc_read_done(zio_t *);
 static void l2arc_do_free_on_write(l2arc_dev_t *dev);
 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only);
+static uint64_t l2arc_get_write_rate(l2arc_dev_t *dev);
 
 static void arc_prune_async(uint64_t adjust);
 
@@ -8363,7 +8376,7 @@ arc_fini(void)
  * may be necessary for different workloads:
  *
  *	l2arc_write_max		max write bytes per interval
- *	l2arc_write_boost	extra write bytes during device warmup
+ *	l2arc_dwpd_limit	device write endurance limit (100 = 1.0 DWPD)
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_headroom		number of max device writes to precache
  *	l2arc_headroom_boost	when we find compressed buffers during ARC
@@ -8380,7 +8393,6 @@ arc_fini(void)
  *
  *	l2arc_write_eligible()	check if a buffer is eligible to cache
- *	l2arc_write_size()	calculate how much to write
- *	l2arc_write_interval()	calculate sleep delay between writes
+ *	l2arc_write_size()	calculate how much to write and how often
  *
- * These three functions determine what to write, how much, and how quickly
+ * These two functions determine what to write, how much, and how quickly
  * to send writes.
@@ -8501,24 +8513,22 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 }
 
 static uint64_t
-l2arc_write_size(l2arc_dev_t *dev)
+l2arc_write_size(l2arc_dev_t *dev, clock_t *interval)
 {
 	uint64_t size;
+	uint64_t write_rate = l2arc_get_write_rate(dev);
 
-	/*
-	 * Make sure our globals have meaningful values in case the user
-	 * altered them.
-	 */
-	size = l2arc_write_max;
-	if (size == 0) {
-		cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
-		    "resetting it to the default (%d)", L2ARC_WRITE_SIZE);
-		size = l2arc_write_max = L2ARC_WRITE_SIZE;
+	if (write_rate > L2ARC_BURST_SIZE_MAX) {
+		/* Split rate into capped bursts; keep interval >= 1 tick */
+		uint64_t feeds_per_sec =
+		    MAX(DIV_ROUND_UP(write_rate, L2ARC_BURST_SIZE_MAX), 1);
+		*interval = MAX(hz / feeds_per_sec, 1);
+		size = write_rate / feeds_per_sec;
+	} else {
+		*interval = hz;	/* 1 second default */
+		size = write_rate;
 	}
 
-	if (arc_warm == B_FALSE)
-		size += l2arc_write_boost;
-
 	/* We need to add in the worst case scenario of log block overhead. */
 	size += l2arc_log_blk_overhead(size, dev);
 	if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
@@ -8543,28 +8553,6 @@ l2arc_write_size(l2arc_dev_t *dev)
 
 }
 
-static clock_t
-l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
-{
-	clock_t interval, next, now;
-
-	/*
-	 * If the ARC lists are busy, increase our write rate; if the
-	 * lists are stale, idle back. This is achieved by checking
-	 * how much we previously wrote - if it was more than half of
-	 * what we wanted, schedule the next write much sooner.
-	 */
-	if (l2arc_feed_again && wrote > (wanted / 2))
-		interval = (hz * l2arc_feed_min_ms) / 1000;
-	else
-		interval = hz * l2arc_feed_secs;
-
-	now = ddi_get_lbolt();
-	next = MAX(now, MIN(now + interval, began + interval));
-
-	return (next);
-}
-
 /*
  * Free buffers that were tagged for destruction.
  */
@@ -9170,6 +9158,85 @@ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
 	}
 }
 
+/*
+ * Bump the DWPD generation to trigger stats reset on all devices.
+ */
+void
+l2arc_dwpd_bump_reset(void)
+{
+	l2arc_dwpd_bump++;
+}
+
+/*
+ * Calculate DWPD rate limit for L2ARC device, in bytes per second.
+ */
+static uint64_t
+l2arc_dwpd_rate_limit(l2arc_dev_t *dev)
+{
+	uint64_t device_size = dev->l2ad_end - dev->l2ad_start;
+	uint64_t daily_budget = (device_size * l2arc_dwpd_limit) / 100;
+	uint64_t now = gethrestime_sec();
+
+	/* Reset stats on param change or daily period expiry */
+	if (dev->l2ad_dwpd_bump != l2arc_dwpd_bump ||
+	    (now - dev->l2ad_dwpd_start) >= 24 * 3600) {
+		if (dev->l2ad_dwpd_bump != l2arc_dwpd_bump) {
+			/* Full reset on param change, no carryover */
+			dev->l2ad_dwpd_accumulated = 0;
+			dev->l2ad_dwpd_bump = l2arc_dwpd_bump;
+		} else {
+			/* Save unused budget from last period (max 1 day) */
+			if (dev->l2ad_dwpd_writes >= daily_budget)
+				dev->l2ad_dwpd_accumulated = 0;
+			else
+				dev->l2ad_dwpd_accumulated =
+				    daily_budget - dev->l2ad_dwpd_writes;
+		}
+		dev->l2ad_dwpd_writes = 0;
+		dev->l2ad_dwpd_start = now;
+	}
+
+	uint64_t elapsed = now - dev->l2ad_dwpd_start;
+	uint64_t remaining_secs = MAX((24 * 3600) - elapsed, 1);
+	/* Add burst allowance for the first write after device wrap */
+	uint64_t total_budget = daily_budget + dev->l2ad_dwpd_accumulated +
+	    L2ARC_BURST_SIZE_MAX;
+
+	if (dev->l2ad_dwpd_writes >= total_budget)
+		return (0);
+
+	return ((total_budget - dev->l2ad_dwpd_writes) / remaining_secs);
+}
+
+/*
+ * Get write rate (bytes/sec) from device state and DWPD configuration.
+ */
+static uint64_t
+l2arc_get_write_rate(l2arc_dev_t *dev)
+{
+	uint64_t write_max = l2arc_write_max;
+	spa_t *spa = dev->l2ad_spa;
+
+	/*
+	 * Make sure l2arc_write_max is valid in case user altered it.
+	 */
+	if (write_max == 0) {
+		cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
+		    "resetting it to the default (%d)", L2ARC_WRITE_SIZE);
+		write_max = l2arc_write_max = L2ARC_WRITE_SIZE;
+	}
+
+	/* Apply DWPD rate limit for persistent marker configurations */
+	if (!dev->l2ad_first && l2arc_dwpd_limit > 0 &&
+	    spa->spa_l2arc_info.l2arc_total_capacity >=
+	    L2ARC_PERSIST_THRESHOLD) {
+		uint64_t dwpd_rate = l2arc_dwpd_rate_limit(dev);
+		return (MIN(dwpd_rate, write_max));
+	}
+
+	return (write_max);
+}
+
 /*
  * Evict buffers from the device write hand to the distance specified in
  * bytes. This distance may span populated buffers, it may span nothing.
@@ -9379,6 +9446,14 @@ out:
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
+		/*
+		 * Reset DWPD counters - first pass writes are free, start
+		 * fresh 24h budget period now that device is full.
+		 */
+		dev->l2ad_dwpd_writes = 0;
+		dev->l2ad_dwpd_start = gethrestime_sec();
+		dev->l2ad_dwpd_accumulated = 0;
+		dev->l2ad_dwpd_bump = l2arc_dwpd_bump;
 		goto top;
 	}
 
@@ -9852,9 +9927,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	 * vs ARC size. Use persistent markers for pools with significant
 	 * L2ARC investment, otherwise use simple HEAD/TAIL scanning.
 	 */
-	uint64_t threshold = MIN((arc_c_max / 4), arc_c);
 	boolean_t save_position =
-	    (spa->spa_l2arc_info.l2arc_total_capacity >= threshold);
+	    (spa->spa_l2arc_info.l2arc_total_capacity >=
+	    L2ARC_PERSIST_THRESHOLD);
 
 	/*
 	 * Check if markers need reset based on smallest device threshold.
@@ -9988,6 +10063,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	spa->spa_l2arc_info.l2arc_total_writes += write_asize;
 	mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);
 
+	/* Track writes for DWPD rate limiting */
+	dev->l2ad_dwpd_writes += write_asize;
+
 	/*
 	 * Update the device header after the zio completes as
 	 * l2arc_write_done() may have updated the memory holding the log block
@@ -10070,7 +10148,8 @@ l2arc_feed_thread(void *arg)
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
-		size = l2arc_write_size(dev);
+		clock_t interval;
+		size = l2arc_write_size(dev, &interval);
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
@@ -10083,9 +10162,18 @@ l2arc_feed_thread(void *arg)
 		wrote = l2arc_write_buffers(spa, dev, size);
 
 		/*
-		 * Calculate interval between writes.
+		 * Scale interval to what was written; never below one tick.
 		 */
-		next = l2arc_write_interval(begin, size, wrote);
+		if (wrote == 0)
+			interval = hz * l2arc_feed_secs;
+		else if (wrote < size)
+			interval = MAX((interval * wrote) / size, 1);
+
+		/*
+		 * Calculate next feed time.
+		 */
+		clock_t now = ddi_get_lbolt();
+		next = MAX(now, MIN(now + interval, begin + interval));
 		spa_config_exit(spa, SCL_L2ARC, dev);
 	}
 	spl_fstrans_unmark(cookie);
@@ -10255,6 +10343,10 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 	adddev->l2ad_first = B_TRUE;
 	adddev->l2ad_writing = B_FALSE;
 	adddev->l2ad_trim_all = B_FALSE;
+	adddev->l2ad_dwpd_writes = 0;
+	adddev->l2ad_dwpd_start = gethrestime_sec();
+	adddev->l2ad_dwpd_accumulated = 0;
+	adddev->l2ad_dwpd_bump = l2arc_dwpd_bump;
 	list_link_init(&adddev->l2ad_node);
 	adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
 
@@ -11536,8 +11628,9 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW,
 	"Max write bytes per interval");
 
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW,
-	"Extra write bytes during device warmup");
+ZFS_MODULE_PARAM_CALL(zfs_l2arc, l2arc_, dwpd_limit, param_set_l2arc_dwpd_limit,
+	spl_param_get_u64, ZMOD_RW,
+	"L2ARC device endurance limit as percentage (100 = 1.0 DWPD)");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW,
 	"Number of max device writes to precache");