L2ARC: Implement persistent markers with consistent tail scanning

This commit introduces per-sublist persistent markers that eliminate
redundant tail scanning between L2ARC iterations, providing significant
CPU efficiency improvements. Markers are pre-allocated during device
initialization and properly cleaned up during device removal.

The implementation chooses the scan strategy from the pool's total
L2ARC capacity relative to ARC size: pools whose total L2ARC capacity
stays below MIN(arc_c_max / 4, arc_c) retain the original HEAD/TAIL
scanning based on ARC warmup state, while pools at or above that
threshold use the persistent marker approach for better CPU efficiency.

Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #18093
Ameer Hamza, 2025-07-06 03:41:54 +05:00; committed by Brian Behlendorf
commit 2f41b9d865, parent 3523b5f3f9
3 changed files with 307 additions and 39 deletions
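The strategy choice boils down to a single comparison against a capacity
threshold. Below is a minimal standalone sketch of that decision,
mirroring the l2arc_write_buffers() hunk further down; the wrapper
function and its parameters are illustrative stand-ins for arc_c,
arc_c_max, and spa_l2arc_info.l2arc_total_capacity, not part of the
commit:

#include <stdint.h>
#include <stdbool.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

/*
 * Sketch: persistent markers are used once the pool's total L2ARC
 * capacity reaches MIN(arc_c_max / 4, arc_c).
 */
static bool
use_persistent_markers(uint64_t l2_total_capacity, uint64_t arc_c,
    uint64_t arc_c_max)
{
	uint64_t threshold = MIN(arc_c_max / 4, arc_c);
	return (l2_total_capacity >= threshold);
}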

include/sys/arc_impl.h

@@ -41,6 +41,16 @@
extern "C" {
#endif
/*
* L2ARC state and statistics for persistent marker management.
*/
typedef struct l2arc_info {
arc_buf_hdr_t ***l2arc_markers; /* persistent markers */
uint64_t l2arc_total_writes; /* total writes for reset */
uint64_t l2arc_total_capacity; /* total L2ARC capacity */
uint64_t l2arc_smallest_capacity; /* smallest device capacity */
} l2arc_info_t;
/*
* Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)

include/sys/spa_impl.h

@@ -52,6 +52,7 @@
#include <sys/dsl_crypt.h>
#include <sys/zfeature.h>
#include <sys/zthr.h>
#include <sys/arc_impl.h>
#include <sys/dsl_deadlist.h>
#include <zfeature_common.h>
@@ -283,6 +284,7 @@ struct spa {
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
boolean_t spa_aux_sync_uber; /* need to sync aux uber */
l2arc_info_t spa_l2arc_info; /* L2ARC state and stats */
nvlist_t *spa_label_features; /* Features for reading MOS */
uint64_t spa_config_object; /* MOS object for pool config */
uint64_t spa_config_generation; /* config generation number */

module/zfs/arc.c

@@ -9084,6 +9084,98 @@ l2arc_sublist_lock(int list_num, int sublist_idx)
return (multilist_sublist_lock_idx(ml, sublist_idx));
}
/*
* Check if a pool has any L2ARC devices.
*/
static boolean_t
l2arc_pool_has_devices(spa_t *target_spa)
{
l2arc_dev_t *dev;
ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
for (dev = list_head(l2arc_dev_list); dev != NULL;
dev = list_next(l2arc_dev_list, dev)) {
if (dev->l2ad_spa == target_spa) {
return (B_TRUE);
}
}
return (B_FALSE);
}
/*
* Initialize pool-based markers for l2arc position saving.
*/
static void
l2arc_pool_markers_init(spa_t *spa)
{
ASSERT(spa->spa_l2arc_info.l2arc_markers == NULL);
spa->spa_l2arc_info.l2arc_markers = kmem_zalloc(L2ARC_FEED_TYPES *
sizeof (arc_buf_hdr_t **), KM_SLEEP);
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
multilist_t *ml = l2arc_get_list(pass);
if (ml == NULL)
continue;
int num_sublists = multilist_get_num_sublists(ml);
spa->spa_l2arc_info.l2arc_markers[pass] =
arc_state_alloc_markers(num_sublists);
for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls =
multilist_sublist_lock_idx(ml, i);
multilist_sublist_insert_tail(mls,
spa->spa_l2arc_info.l2arc_markers[pass][i]);
multilist_sublist_unlock(mls);
}
}
}
/*
* Free all allocated pool-based markers.
*/
static void
l2arc_pool_markers_fini(spa_t *spa)
{
ASSERT(spa->spa_l2arc_info.l2arc_markers != NULL);
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
if (spa->spa_l2arc_info.l2arc_markers[pass] == NULL)
continue;
multilist_t *ml = l2arc_get_list(pass);
if (ml == NULL)
continue;
int num_sublists = multilist_get_num_sublists(ml);
for (int i = 0; i < num_sublists; i++) {
ASSERT3P(spa->spa_l2arc_info.l2arc_markers[pass][i],
!=, NULL);
multilist_sublist_t *mls =
multilist_sublist_lock_idx(ml, i);
ASSERT(multilist_link_active(
&spa->spa_l2arc_info.l2arc_markers[pass][i]->
b_l1hdr.b_arc_node));
multilist_sublist_remove(mls,
spa->spa_l2arc_info.l2arc_markers[pass][i]);
multilist_sublist_unlock(mls);
}
arc_state_free_markers(spa->spa_l2arc_info.l2arc_markers[pass],
num_sublists);
spa->spa_l2arc_info.l2arc_markers[pass] = NULL;
}
kmem_free(spa->spa_l2arc_info.l2arc_markers, L2ARC_FEED_TYPES *
sizeof (arc_buf_hdr_t **));
spa->spa_l2arc_info.l2arc_markers = NULL;
}
/*
* Calculates the maximum overhead of L2ARC metadata log blocks for a given
* L2ARC write size. l2arc_evict and l2arc_write_size need to include this
@@ -9454,37 +9546,63 @@ error:
}
/*
* Process a single sublist for L2ARC writing.
* Write buffers from a single sublist to L2ARC.
* Handles locking, marker determination, and buffer processing.
* Returns B_TRUE if target size reached, B_FALSE otherwise.
*/
static boolean_t
l2arc_write_sublist(spa_t *spa, l2arc_dev_t *dev, multilist_sublist_t *mls,
arc_buf_hdr_t *marker, boolean_t from_head, uint64_t target_sz,
uint64_t *write_asize, uint64_t *write_psize, zio_t **pio,
l2arc_write_callback_t **cb, arc_buf_hdr_t *head, uint64_t *consumed,
uint64_t sublist_headroom, uint64_t guid)
l2arc_write_sublist(spa_t *spa, l2arc_dev_t *dev, int pass, int sublist_idx,
uint64_t target_sz, uint64_t *write_asize, uint64_t *write_psize,
zio_t **pio, l2arc_write_callback_t **cb, arc_buf_hdr_t *head,
uint64_t *consumed, uint64_t sublist_headroom, boolean_t save_position)
{
arc_buf_hdr_t *hdr;
multilist_sublist_t *mls;
arc_buf_hdr_t *hdr, *prev_hdr;
arc_buf_hdr_t *persistent_marker, *local_marker;
boolean_t full = B_FALSE;
boolean_t scan_from_head = B_FALSE;
uint64_t guid = spa_load_guid(spa);
/*
* Until the ARC is warm and starts to evict, read from the
* head of the ARC lists rather than the tail.
*/
if (from_head)
hdr = multilist_sublist_head(mls);
else
hdr = multilist_sublist_tail(mls);
mls = l2arc_sublist_lock(pass, sublist_idx);
ASSERT3P(mls, !=, NULL);
persistent_marker = spa->spa_l2arc_info.
l2arc_markers[pass][sublist_idx];
if (save_position && persistent_marker == multilist_sublist_head(mls)) {
multilist_sublist_unlock(mls);
return (B_FALSE);
}
local_marker = arc_state_alloc_marker();
if (save_position) {
hdr = multilist_sublist_prev(mls, persistent_marker);
ASSERT3P(hdr, !=, NULL);
scan_from_head = B_FALSE;
} else {
if (arc_warm) {
hdr = multilist_sublist_tail(mls);
scan_from_head = B_FALSE;
} else {
hdr = multilist_sublist_head(mls);
scan_from_head = B_TRUE;
}
ASSERT3P(hdr, !=, NULL);
}
prev_hdr = hdr;
while (hdr != NULL) {
kmutex_t *hash_lock;
abd_t *to_write = NULL;
prev_hdr = hdr;
hash_lock = HDR_LOCK(hdr);
if (!mutex_tryenter(hash_lock)) {
skip:
/* Skip this buffer rather than waiting. */
if (from_head)
if (scan_from_head)
hdr = multilist_sublist_next(mls, hdr);
else
hdr = multilist_sublist_prev(mls, hdr);
@@ -9532,11 +9650,10 @@ skip:
* may block ARC eviction. Insert a marker to save
* the position and drop the lock.
*/
if (from_head) {
multilist_sublist_insert_after(mls, hdr, marker);
} else {
multilist_sublist_insert_before(mls, hdr, marker);
}
if (scan_from_head)
multilist_sublist_insert_after(mls, hdr, local_marker);
else
multilist_sublist_insert_before(mls, hdr, local_marker);
multilist_sublist_unlock(mls);
/*
@@ -9637,13 +9754,46 @@ skip:
next:
multilist_sublist_lock(mls);
if (from_head)
hdr = multilist_sublist_next(mls, marker);
if (scan_from_head)
hdr = multilist_sublist_next(mls, local_marker);
else
hdr = multilist_sublist_prev(mls, marker);
multilist_sublist_remove(mls, marker);
hdr = multilist_sublist_prev(mls, local_marker);
multilist_sublist_remove(mls, local_marker);
}
/*
* Position the persistent marker for the next iteration. When
* save_position is set, validate that prev_hdr still belongs to the
* current sublist: the sublist lock is dropped during L2ARC write I/O,
* allowing ARC eviction to free prev_hdr. If that happened, we can't
* do much except reset the marker to the tail.
*/
multilist_sublist_remove(mls, persistent_marker);
if (save_position &&
multilist_link_active(&prev_hdr->b_l1hdr.b_arc_node)) {
if (hdr != NULL) {
/*
* Break: prev_hdr not written, retry next time.
* Scan is TAIL->HEAD, so insert_after = retry.
*/
multilist_sublist_insert_after(mls, prev_hdr,
persistent_marker);
} else {
/*
* List end: prev_hdr processed, move on.
* insert_before = skip prev_hdr next scan.
*/
multilist_sublist_insert_before(mls, prev_hdr,
persistent_marker);
}
} else {
multilist_sublist_insert_tail(mls, persistent_marker);
}
multilist_sublist_unlock(mls);
arc_state_free_marker(local_marker);
return (full);
}
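l2arc_write_sublist() juggles two kinds of marker: a short-lived
local_marker that pins the scan position whenever the sublist lock is
dropped to issue write I/O, and the pool's persistent_marker that
carries the position across feed-thread iterations. The underlying
walk-with-marker idiom, reduced to a runnable toy on a plain
doubly-linked list (locking elided; none of this is ZFS API):

#include <stdio.h>

typedef struct node {
	struct node *prev, *next;
	int val;
} node_t;

/* Link n immediately before pos. */
static void
insert_before(node_t *pos, node_t *n)
{
	n->prev = pos->prev;
	n->next = pos;
	if (pos->prev != NULL)
		pos->prev->next = n;
	pos->prev = n;
}

/* Unlink n from the list. */
static void
remove_node(node_t *n)
{
	if (n->prev != NULL)
		n->prev->next = n->next;
	if (n->next != NULL)
		n->next->prev = n->prev;
	n->prev = n->next = NULL;
}

int
main(void)
{
	/* Build 1 <-> 2 <-> 3 and walk tail -> head with a marker. */
	node_t n1 = { 0 }, n2 = { 0 }, n3 = { 0 }, marker = { 0 };
	n1.val = 1; n2.val = 2; n3.val = 3;
	n1.next = &n2; n2.prev = &n1; n2.next = &n3; n3.prev = &n2;

	node_t *hdr = &n3;			/* start at the tail */
	while (hdr != NULL) {
		insert_before(hdr, &marker);	/* save the position */
		/* ...the real code drops the sublist lock here... */
		printf("processed %d\n", hdr->val);
		/* ...lock re-acquired; resume the scan at the marker... */
		hdr = marker.prev;
		remove_node(&marker);
	}
	return (0);
}

The persistent marker follows the same insert/remove discipline but
survives across calls, which is why l2arc_pool_markers_fini() must pull
it back out of the sublist when the last cache device goes away.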
@@ -9658,6 +9808,45 @@ l2arc_blk_fetch_done(zio_t *zio)
kmem_free(cb, sizeof (l2arc_read_callback_t));
}
/*
* Reset all L2ARC markers to tail position for the given spa.
*/
static void
l2arc_reset_all_markers(spa_t *spa)
{
ASSERT(spa->spa_l2arc_info.l2arc_markers != NULL);
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
if (spa->spa_l2arc_info.l2arc_markers[pass] == NULL)
continue;
multilist_t *ml = l2arc_get_list(pass);
int num_sublists = multilist_get_num_sublists(ml);
for (int i = 0; i < num_sublists; i++) {
ASSERT3P(spa->spa_l2arc_info.l2arc_markers[pass][i],
!=, NULL);
multilist_sublist_t *mls =
multilist_sublist_lock_idx(ml, i);
/* Remove from current position */
ASSERT(multilist_link_active(&spa->spa_l2arc_info.
l2arc_markers[pass][i]->b_l1hdr.b_arc_node));
multilist_sublist_remove(mls, spa->spa_l2arc_info.
l2arc_markers[pass][i]);
/* Insert at tail (like initialization) */
multilist_sublist_insert_tail(mls,
spa->spa_l2arc_info.l2arc_markers[pass][i]);
multilist_sublist_unlock(mls);
}
}
/* Reset write counter */
spa->spa_l2arc_info.l2arc_total_writes = 0;
}
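Resets happen on a write-volume cadence rather than a timer: once
cumulative writes reach 1/8th of the smallest cache device, every
marker goes back to the tail so previously skipped buffers get
reconsidered. A standalone sketch of the trigger checked in
l2arc_write_buffers() below (the function name and parameters here are
illustrative):

#include <stdint.h>
#include <stdbool.h>

/* Sketch: a reset is due once total writes reach smallest_capacity/8. */
static bool
l2arc_markers_reset_due(uint64_t total_writes, uint64_t smallest_capacity)
{
	return (total_writes >= smallest_capacity / 8);
}

For example, with a 64 GiB smallest cache device, the markers reset
after 8 GiB of cumulative L2ARC writes.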
/*
* Find and write ARC buffers to the L2ARC device.
*
@@ -9673,12 +9862,11 @@ l2arc_blk_fetch_done(zio_t *zio)
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
arc_buf_hdr_t *head, *marker;
arc_buf_hdr_t *head;
uint64_t write_asize, write_psize, headroom;
boolean_t full, from_head = !arc_warm;
boolean_t full;
l2arc_write_callback_t *cb = NULL;
zio_t *pio;
uint64_t guid = spa_load_guid(spa);
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
ASSERT3P(dev->l2ad_vdev, !=, NULL);
@@ -9688,7 +9876,23 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
full = B_FALSE;
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
marker = arc_state_alloc_marker();
/*
* Determine the scan strategy from the pool's total L2ARC capacity
* relative to ARC size: use persistent markers for pools with a
* significant L2ARC investment, otherwise simple HEAD/TAIL scanning.
*/
uint64_t threshold = MIN((arc_c_max / 4), arc_c);
boolean_t save_position =
(spa->spa_l2arc_info.l2arc_total_capacity >= threshold);
/*
* Check whether the markers need a reset: reset once cumulative
* writes exceed 1/8th of the smallest device's capacity.
*/
if (save_position && spa->spa_l2arc_info.l2arc_total_writes >=
spa->spa_l2arc_info.l2arc_smallest_capacity / 8)
l2arc_reset_all_markers(spa);
/*
* Copy buffers for L2ARC writing.
@@ -9721,7 +9925,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
int processed_sublists = 0;
while (processed_sublists < num_sublists && !full) {
uint64_t sublist_headroom;
multilist_sublist_t *mls;
if (consumed_headroom >= headroom)
break;
@@ -9731,15 +9934,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
if (sublist_headroom == 0)
break;
mls = l2arc_sublist_lock(pass, current_sublist);
ASSERT3P(mls, !=, NULL);
full = l2arc_write_sublist(spa, dev, mls, marker,
from_head, target_sz, &write_asize, &write_psize,
&pio, &cb, head, &consumed_headroom,
sublist_headroom, guid);
/*
* Write buffers from this sublist to L2ARC.
* Function handles locking, marker management, and
* buffer processing internally.
*/
full = l2arc_write_sublist(spa, dev, pass,
current_sublist, target_sz, &write_asize,
&write_psize, &pio, &cb, head,
&consumed_headroom, sublist_headroom,
save_position);
multilist_sublist_unlock(mls);
current_sublist = (current_sublist + 1) % num_sublists;
processed_sublists++;
}
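The loop above walks sublists round-robin, wrapping with a modulo so
each sublist is visited at most once per call. A tiny standalone demo
of the visit order (the counts and starting index are arbitrary):

#include <stdio.h>

int
main(void)
{
	int num_sublists = 4, current_sublist = 2;
	for (int processed = 0; processed < num_sublists; processed++) {
		printf("visit sublist %d\n", current_sublist);
		current_sublist = (current_sublist + 1) % num_sublists;
	}
	return (0);	/* visits 2, 3, 0, 1 */
}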
@@ -9748,8 +9954,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break;
}
arc_state_free_marker(marker);
/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_psize);
@@ -9777,6 +9981,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
(void) zio_wait(pio);
dev->l2ad_writing = B_FALSE;
/*
* Update cumulative write tracking for marker reset logic.
*/
spa->spa_l2arc_info.l2arc_total_writes += write_asize;
/*
* Update the device header after the zio completes as
* l2arc_write_done() may have updated the memory holding the log block
@@ -10003,6 +10212,30 @@ l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
}
}
/*
* Recalculate smallest L2ARC device capacity for the given spa.
* Must be called under l2arc_dev_mtx.
*/
static void
l2arc_update_smallest_capacity(spa_t *spa)
{
ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
l2arc_dev_t *dev;
uint64_t smallest = UINT64_MAX;
for (dev = list_head(l2arc_dev_list); dev != NULL;
dev = list_next(l2arc_dev_list, dev)) {
if (dev->l2ad_spa == spa) {
uint64_t cap = dev->l2ad_end - dev->l2ad_start;
if (cap < smallest)
smallest = cap;
}
}
spa->spa_l2arc_info.l2arc_smallest_capacity = smallest;
}
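Both capacity fields follow from simple per-device arithmetic: each
device contributes l2ad_end - l2ad_start to the pool total, and the
minimum contribution gates the marker-reset cadence above. A runnable
userspace model of that bookkeeping (struct and field names are
stand-ins for spa_t and l2arc_dev_t):

#include <stdio.h>
#include <stdint.h>

typedef struct pool {
	uint64_t l2_total_capacity;	/* ~ l2arc_total_capacity */
	uint64_t l2_smallest_capacity;	/* ~ l2arc_smallest_capacity */
} pool_t;

typedef struct l2dev {
	pool_t *owner;			/* ~ dev->l2ad_spa */
	uint64_t start, end;		/* ~ l2ad_start / l2ad_end */
} l2dev_t;

int
main(void)
{
	pool_t p = { 0, UINT64_MAX };
	l2dev_t a = { &p, 0, 1ULL << 30 };	/* 1 GiB device */
	l2dev_t b = { &p, 0, 1ULL << 29 };	/* 512 MiB device */
	l2dev_t *devs[] = { &a, &b };
	for (int i = 0; i < 2; i++) {
		uint64_t cap = devs[i]->end - devs[i]->start;
		devs[i]->owner->l2_total_capacity += cap;
		if (cap < devs[i]->owner->l2_smallest_capacity)
			devs[i]->owner->l2_smallest_capacity = cap;
	}
	printf("total=%llu smallest=%llu\n",
	    (unsigned long long)p.l2_total_capacity,
	    (unsigned long long)p.l2_smallest_capacity);
	return (0);
}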
/*
* Add a vdev for use by the L2ARC. By this point the spa has already
* validated the vdev and opened it.
@@ -10068,8 +10301,20 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
* Add device to global list
*/
mutex_enter(&l2arc_dev_mtx);
/*
* Initialize pool-based position-saving markers if this is the first
* L2ARC device for this pool.
*/
if (!l2arc_pool_has_devices(spa)) {
l2arc_pool_markers_init(spa);
}
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
spa->spa_l2arc_info.l2arc_total_capacity += (adddev->l2ad_end -
adddev->l2ad_start);
l2arc_update_smallest_capacity(spa);
mutex_exit(&l2arc_dev_mtx);
}
@@ -10190,6 +10435,17 @@ l2arc_remove_vdev(vdev_t *vd)
list_remove(l2arc_dev_list, remdev);
l2arc_dev_last = NULL; /* may have been invalidated */
atomic_dec_64(&l2arc_ndev);
spa->spa_l2arc_info.l2arc_total_capacity -=
(remdev->l2ad_end - remdev->l2ad_start);
l2arc_update_smallest_capacity(spa);
/*
* Clean up pool-based markers if this was the last L2ARC device
* for this pool.
*/
if (!l2arc_pool_has_devices(spa)) {
l2arc_pool_markers_fini(spa);
}
/* During a pool export spa & vdev will no longer be valid */
if (asynchronous) {