mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-03-11 04:46:18 +03:00
L2ARC: Implement persistent markers with consistent tail scanning
This commit introduces per-sublist persistent markers that eliminate redundant tail scanning between L2ARC iterations, providing significant CPU efficiency improvements. Markers are pre-allocated during device initialization and properly cleaned up during device removal. The implementation uses conditional behavior based on device capacity: small devices (capacity < arc_c) retain original HEAD/TAIL scanning based on ARC warmup state, while large devices (capacity >= arc_c) use the persistent marker approach for optimal CPU efficiency. Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Ameer Hamza <ahamza@ixsystems.com> Closes #18093
This commit is contained in:
parent
3523b5f3f9
commit
2f41b9d865
@ -41,6 +41,16 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* L2ARC state and statistics for persistent marker management.
|
||||
*/
|
||||
typedef struct l2arc_info {
|
||||
arc_buf_hdr_t ***l2arc_markers; /* persistent markers */
|
||||
uint64_t l2arc_total_writes; /* total writes for reset */
|
||||
uint64_t l2arc_total_capacity; /* total L2ARC capacity */
|
||||
uint64_t l2arc_smallest_capacity; /* smallest device capacity */
|
||||
} l2arc_info_t;
|
||||
|
||||
/*
|
||||
* Note that buffers can be in one of 6 states:
|
||||
* ARC_anon - anonymous (discussed below)
|
||||
|
||||
@ -52,6 +52,7 @@
|
||||
#include <sys/dsl_crypt.h>
|
||||
#include <sys/zfeature.h>
|
||||
#include <sys/zthr.h>
|
||||
#include <sys/arc_impl.h>
|
||||
#include <sys/dsl_deadlist.h>
|
||||
#include <zfeature_common.h>
|
||||
|
||||
@ -283,6 +284,7 @@ struct spa {
|
||||
spa_aux_vdev_t spa_spares; /* hot spares */
|
||||
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
|
||||
boolean_t spa_aux_sync_uber; /* need to sync aux uber */
|
||||
l2arc_info_t spa_l2arc_info; /* L2ARC state and stats */
|
||||
nvlist_t *spa_label_features; /* Features for reading MOS */
|
||||
uint64_t spa_config_object; /* MOS object for pool config */
|
||||
uint64_t spa_config_generation; /* config generation number */
|
||||
|
||||
334
module/zfs/arc.c
334
module/zfs/arc.c
@ -9084,6 +9084,98 @@ l2arc_sublist_lock(int list_num, int sublist_idx)
|
||||
return (multilist_sublist_lock_idx(ml, sublist_idx));
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if a pool has any L2ARC devices.
|
||||
*/
|
||||
static boolean_t
|
||||
l2arc_pool_has_devices(spa_t *target_spa)
|
||||
{
|
||||
l2arc_dev_t *dev;
|
||||
|
||||
ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
|
||||
|
||||
for (dev = list_head(l2arc_dev_list); dev != NULL;
|
||||
dev = list_next(l2arc_dev_list, dev)) {
|
||||
if (dev->l2ad_spa == target_spa) {
|
||||
return (B_TRUE);
|
||||
}
|
||||
}
|
||||
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize pool-based markers for l2arc position saving.
|
||||
*/
|
||||
static void
|
||||
l2arc_pool_markers_init(spa_t *spa)
|
||||
{
|
||||
ASSERT(spa->spa_l2arc_info.l2arc_markers == NULL);
|
||||
|
||||
spa->spa_l2arc_info.l2arc_markers = kmem_zalloc(L2ARC_FEED_TYPES *
|
||||
sizeof (arc_buf_hdr_t **), KM_SLEEP);
|
||||
|
||||
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
|
||||
multilist_t *ml = l2arc_get_list(pass);
|
||||
if (ml == NULL)
|
||||
continue;
|
||||
|
||||
int num_sublists = multilist_get_num_sublists(ml);
|
||||
|
||||
spa->spa_l2arc_info.l2arc_markers[pass] =
|
||||
arc_state_alloc_markers(num_sublists);
|
||||
|
||||
for (int i = 0; i < num_sublists; i++) {
|
||||
multilist_sublist_t *mls =
|
||||
multilist_sublist_lock_idx(ml, i);
|
||||
multilist_sublist_insert_tail(mls,
|
||||
spa->spa_l2arc_info.l2arc_markers[pass][i]);
|
||||
multilist_sublist_unlock(mls);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Free all allocated pool-based markers.
|
||||
*/
|
||||
static void
|
||||
l2arc_pool_markers_fini(spa_t *spa)
|
||||
{
|
||||
ASSERT(spa->spa_l2arc_info.l2arc_markers != NULL);
|
||||
|
||||
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
|
||||
if (spa->spa_l2arc_info.l2arc_markers[pass] == NULL)
|
||||
continue;
|
||||
|
||||
multilist_t *ml = l2arc_get_list(pass);
|
||||
if (ml == NULL)
|
||||
continue;
|
||||
|
||||
int num_sublists = multilist_get_num_sublists(ml);
|
||||
|
||||
for (int i = 0; i < num_sublists; i++) {
|
||||
ASSERT3P(spa->spa_l2arc_info.l2arc_markers[pass][i],
|
||||
!=, NULL);
|
||||
multilist_sublist_t *mls =
|
||||
multilist_sublist_lock_idx(ml, i);
|
||||
ASSERT(multilist_link_active(
|
||||
&spa->spa_l2arc_info.l2arc_markers[pass][i]->
|
||||
b_l1hdr.b_arc_node));
|
||||
multilist_sublist_remove(mls,
|
||||
spa->spa_l2arc_info.l2arc_markers[pass][i]);
|
||||
multilist_sublist_unlock(mls);
|
||||
}
|
||||
|
||||
arc_state_free_markers(spa->spa_l2arc_info.l2arc_markers[pass],
|
||||
num_sublists);
|
||||
spa->spa_l2arc_info.l2arc_markers[pass] = NULL;
|
||||
}
|
||||
|
||||
kmem_free(spa->spa_l2arc_info.l2arc_markers, L2ARC_FEED_TYPES *
|
||||
sizeof (arc_buf_hdr_t **));
|
||||
spa->spa_l2arc_info.l2arc_markers = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculates the maximum overhead of L2ARC metadata log blocks for a given
|
||||
* L2ARC write size. l2arc_evict and l2arc_write_size need to include this
|
||||
@ -9454,37 +9546,63 @@ error:
|
||||
}
|
||||
|
||||
/*
|
||||
* Process a single sublist for L2ARC writing.
|
||||
* Write buffers from a single sublist to L2ARC.
|
||||
* Handles locking, marker determination, and buffer processing.
|
||||
* Returns B_TRUE if target size reached, B_FALSE otherwise.
|
||||
*/
|
||||
static boolean_t
|
||||
l2arc_write_sublist(spa_t *spa, l2arc_dev_t *dev, multilist_sublist_t *mls,
|
||||
arc_buf_hdr_t *marker, boolean_t from_head, uint64_t target_sz,
|
||||
uint64_t *write_asize, uint64_t *write_psize, zio_t **pio,
|
||||
l2arc_write_callback_t **cb, arc_buf_hdr_t *head, uint64_t *consumed,
|
||||
uint64_t sublist_headroom, uint64_t guid)
|
||||
l2arc_write_sublist(spa_t *spa, l2arc_dev_t *dev, int pass, int sublist_idx,
|
||||
uint64_t target_sz, uint64_t *write_asize, uint64_t *write_psize,
|
||||
zio_t **pio, l2arc_write_callback_t **cb, arc_buf_hdr_t *head,
|
||||
uint64_t *consumed, uint64_t sublist_headroom, boolean_t save_position)
|
||||
{
|
||||
arc_buf_hdr_t *hdr;
|
||||
multilist_sublist_t *mls;
|
||||
arc_buf_hdr_t *hdr, *prev_hdr;
|
||||
arc_buf_hdr_t *persistent_marker, *local_marker;
|
||||
boolean_t full = B_FALSE;
|
||||
boolean_t scan_from_head = B_FALSE;
|
||||
uint64_t guid = spa_load_guid(spa);
|
||||
|
||||
/*
|
||||
* Until the ARC is warm and starts to evict, read from the
|
||||
* head of the ARC lists rather than the tail.
|
||||
*/
|
||||
if (from_head)
|
||||
hdr = multilist_sublist_head(mls);
|
||||
else
|
||||
hdr = multilist_sublist_tail(mls);
|
||||
mls = l2arc_sublist_lock(pass, sublist_idx);
|
||||
ASSERT3P(mls, !=, NULL);
|
||||
|
||||
persistent_marker = spa->spa_l2arc_info.
|
||||
l2arc_markers[pass][sublist_idx];
|
||||
|
||||
if (save_position && persistent_marker == multilist_sublist_head(mls)) {
|
||||
multilist_sublist_unlock(mls);
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
local_marker = arc_state_alloc_marker();
|
||||
|
||||
if (save_position) {
|
||||
hdr = multilist_sublist_prev(mls, persistent_marker);
|
||||
ASSERT3P(hdr, !=, NULL);
|
||||
scan_from_head = B_FALSE;
|
||||
} else {
|
||||
if (arc_warm) {
|
||||
hdr = multilist_sublist_tail(mls);
|
||||
scan_from_head = B_FALSE;
|
||||
} else {
|
||||
hdr = multilist_sublist_head(mls);
|
||||
scan_from_head = B_TRUE;
|
||||
}
|
||||
ASSERT3P(hdr, !=, NULL);
|
||||
}
|
||||
|
||||
prev_hdr = hdr;
|
||||
|
||||
while (hdr != NULL) {
|
||||
kmutex_t *hash_lock;
|
||||
abd_t *to_write = NULL;
|
||||
prev_hdr = hdr;
|
||||
|
||||
hash_lock = HDR_LOCK(hdr);
|
||||
if (!mutex_tryenter(hash_lock)) {
|
||||
skip:
|
||||
/* Skip this buffer rather than waiting. */
|
||||
if (from_head)
|
||||
if (scan_from_head)
|
||||
hdr = multilist_sublist_next(mls, hdr);
|
||||
else
|
||||
hdr = multilist_sublist_prev(mls, hdr);
|
||||
@ -9532,11 +9650,10 @@ skip:
|
||||
* may block ARC eviction. Insert a marker to save
|
||||
* the position and drop the lock.
|
||||
*/
|
||||
if (from_head) {
|
||||
multilist_sublist_insert_after(mls, hdr, marker);
|
||||
} else {
|
||||
multilist_sublist_insert_before(mls, hdr, marker);
|
||||
}
|
||||
if (scan_from_head)
|
||||
multilist_sublist_insert_after(mls, hdr, local_marker);
|
||||
else
|
||||
multilist_sublist_insert_before(mls, hdr, local_marker);
|
||||
multilist_sublist_unlock(mls);
|
||||
|
||||
/*
|
||||
@ -9637,13 +9754,46 @@ skip:
|
||||
|
||||
next:
|
||||
multilist_sublist_lock(mls);
|
||||
if (from_head)
|
||||
hdr = multilist_sublist_next(mls, marker);
|
||||
if (scan_from_head)
|
||||
hdr = multilist_sublist_next(mls, local_marker);
|
||||
else
|
||||
hdr = multilist_sublist_prev(mls, marker);
|
||||
multilist_sublist_remove(mls, marker);
|
||||
hdr = multilist_sublist_prev(mls, local_marker);
|
||||
multilist_sublist_remove(mls, local_marker);
|
||||
}
|
||||
|
||||
/*
|
||||
* Position persistent marker for next iteration. In case of
|
||||
* save_position, validate that prev_hdr still belongs to the current
|
||||
* sublist. The sublist lock is dropped during L2ARC write I/O, allowing
|
||||
* ARC eviction to potentially free prev_hdr. If freed, we can't do much
|
||||
* except to reset the marker.
|
||||
*/
|
||||
multilist_sublist_remove(mls, persistent_marker);
|
||||
if (save_position &&
|
||||
multilist_link_active(&prev_hdr->b_l1hdr.b_arc_node)) {
|
||||
if (hdr != NULL) {
|
||||
/*
|
||||
* Break: prev_hdr not written, retry next time.
|
||||
* Scan is TAIL->HEAD, so insert_after = retry.
|
||||
*/
|
||||
multilist_sublist_insert_after(mls, prev_hdr,
|
||||
persistent_marker);
|
||||
} else {
|
||||
/*
|
||||
* List end: prev_hdr processed, move on.
|
||||
* insert_before = skip prev_hdr next scan.
|
||||
*/
|
||||
multilist_sublist_insert_before(mls, prev_hdr,
|
||||
persistent_marker);
|
||||
}
|
||||
} else {
|
||||
multilist_sublist_insert_tail(mls, persistent_marker);
|
||||
}
|
||||
|
||||
multilist_sublist_unlock(mls);
|
||||
|
||||
arc_state_free_marker(local_marker);
|
||||
|
||||
return (full);
|
||||
}
|
||||
|
||||
@ -9658,6 +9808,45 @@ l2arc_blk_fetch_done(zio_t *zio)
|
||||
kmem_free(cb, sizeof (l2arc_read_callback_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Reset all L2ARC markers to tail position for the given spa.
|
||||
*/
|
||||
static void
|
||||
l2arc_reset_all_markers(spa_t *spa)
|
||||
{
|
||||
ASSERT(spa->spa_l2arc_info.l2arc_markers != NULL);
|
||||
|
||||
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
|
||||
if (spa->spa_l2arc_info.l2arc_markers[pass] == NULL)
|
||||
continue;
|
||||
|
||||
multilist_t *ml = l2arc_get_list(pass);
|
||||
int num_sublists = multilist_get_num_sublists(ml);
|
||||
|
||||
for (int i = 0; i < num_sublists; i++) {
|
||||
ASSERT3P(spa->spa_l2arc_info.l2arc_markers[pass][i],
|
||||
!=, NULL);
|
||||
multilist_sublist_t *mls =
|
||||
multilist_sublist_lock_idx(ml, i);
|
||||
|
||||
/* Remove from current position */
|
||||
ASSERT(multilist_link_active(&spa->spa_l2arc_info.
|
||||
l2arc_markers[pass][i]->b_l1hdr.b_arc_node));
|
||||
multilist_sublist_remove(mls, spa->spa_l2arc_info.
|
||||
l2arc_markers[pass][i]);
|
||||
|
||||
/* Insert at tail (like initialization) */
|
||||
multilist_sublist_insert_tail(mls,
|
||||
spa->spa_l2arc_info.l2arc_markers[pass][i]);
|
||||
|
||||
multilist_sublist_unlock(mls);
|
||||
}
|
||||
}
|
||||
|
||||
/* Reset write counter */
|
||||
spa->spa_l2arc_info.l2arc_total_writes = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find and write ARC buffers to the L2ARC device.
|
||||
*
|
||||
@ -9673,12 +9862,11 @@ l2arc_blk_fetch_done(zio_t *zio)
|
||||
static uint64_t
|
||||
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||
{
|
||||
arc_buf_hdr_t *head, *marker;
|
||||
arc_buf_hdr_t *head;
|
||||
uint64_t write_asize, write_psize, headroom;
|
||||
boolean_t full, from_head = !arc_warm;
|
||||
boolean_t full;
|
||||
l2arc_write_callback_t *cb = NULL;
|
||||
zio_t *pio;
|
||||
uint64_t guid = spa_load_guid(spa);
|
||||
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
|
||||
|
||||
ASSERT3P(dev->l2ad_vdev, !=, NULL);
|
||||
@ -9688,7 +9876,23 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||
full = B_FALSE;
|
||||
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
|
||||
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
|
||||
marker = arc_state_alloc_marker();
|
||||
|
||||
/*
|
||||
* Determine L2ARC implementation based on total pool L2ARC capacity
|
||||
* vs ARC size. Use persistent markers for pools with significant
|
||||
* L2ARC investment, otherwise use simple HEAD/TAIL scanning.
|
||||
*/
|
||||
uint64_t threshold = MIN((arc_c_max / 4), arc_c);
|
||||
boolean_t save_position =
|
||||
(spa->spa_l2arc_info.l2arc_total_capacity >= threshold);
|
||||
|
||||
/*
|
||||
* Check if markers need reset based on smallest device threshold.
|
||||
* Reset when cumulative writes exceed 1/8th of smallest device.
|
||||
*/
|
||||
if (save_position && spa->spa_l2arc_info.l2arc_total_writes >=
|
||||
spa->spa_l2arc_info.l2arc_smallest_capacity / 8)
|
||||
l2arc_reset_all_markers(spa);
|
||||
|
||||
/*
|
||||
* Copy buffers for L2ARC writing.
|
||||
@ -9721,7 +9925,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||
int processed_sublists = 0;
|
||||
while (processed_sublists < num_sublists && !full) {
|
||||
uint64_t sublist_headroom;
|
||||
multilist_sublist_t *mls;
|
||||
|
||||
if (consumed_headroom >= headroom)
|
||||
break;
|
||||
@ -9731,15 +9934,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||
|
||||
if (sublist_headroom == 0)
|
||||
break;
|
||||
mls = l2arc_sublist_lock(pass, current_sublist);
|
||||
ASSERT3P(mls, !=, NULL);
|
||||
|
||||
full = l2arc_write_sublist(spa, dev, mls, marker,
|
||||
from_head, target_sz, &write_asize, &write_psize,
|
||||
&pio, &cb, head, &consumed_headroom,
|
||||
sublist_headroom, guid);
|
||||
/*
|
||||
* Write buffers from this sublist to L2ARC.
|
||||
* Function handles locking, marker management, and
|
||||
* buffer processing internally.
|
||||
*/
|
||||
full = l2arc_write_sublist(spa, dev, pass,
|
||||
current_sublist, target_sz, &write_asize,
|
||||
&write_psize, &pio, &cb, head,
|
||||
&consumed_headroom, sublist_headroom,
|
||||
save_position);
|
||||
|
||||
multilist_sublist_unlock(mls);
|
||||
current_sublist = (current_sublist + 1) % num_sublists;
|
||||
processed_sublists++;
|
||||
}
|
||||
@ -9748,8 +9954,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||
break;
|
||||
}
|
||||
|
||||
arc_state_free_marker(marker);
|
||||
|
||||
/* No buffers selected for writing? */
|
||||
if (pio == NULL) {
|
||||
ASSERT0(write_psize);
|
||||
@ -9777,6 +9981,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||
(void) zio_wait(pio);
|
||||
dev->l2ad_writing = B_FALSE;
|
||||
|
||||
/*
|
||||
* Update cumulative write tracking for marker reset logic.
|
||||
*/
|
||||
spa->spa_l2arc_info.l2arc_total_writes += write_asize;
|
||||
|
||||
/*
|
||||
* Update the device header after the zio completes as
|
||||
* l2arc_write_done() may have updated the memory holding the log block
|
||||
@ -10003,6 +10212,30 @@ l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Recalculate smallest L2ARC device capacity for the given spa.
|
||||
* Must be called under l2arc_dev_mtx.
|
||||
*/
|
||||
static void
|
||||
l2arc_update_smallest_capacity(spa_t *spa)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
|
||||
l2arc_dev_t *dev;
|
||||
uint64_t smallest = UINT64_MAX;
|
||||
|
||||
for (dev = list_head(l2arc_dev_list); dev != NULL;
|
||||
dev = list_next(l2arc_dev_list, dev)) {
|
||||
if (dev->l2ad_spa == spa) {
|
||||
uint64_t cap = dev->l2ad_end - dev->l2ad_start;
|
||||
if (cap < smallest)
|
||||
smallest = cap;
|
||||
}
|
||||
}
|
||||
|
||||
spa->spa_l2arc_info.l2arc_smallest_capacity = smallest;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a vdev for use by the L2ARC. By this point the spa has already
|
||||
* validated the vdev and opened it.
|
||||
@ -10068,8 +10301,20 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
|
||||
* Add device to global list
|
||||
*/
|
||||
mutex_enter(&l2arc_dev_mtx);
|
||||
|
||||
/*
|
||||
* Initialize pool-based position saving markers if this is the first
|
||||
* L2ARC device for this pool
|
||||
*/
|
||||
if (!l2arc_pool_has_devices(spa)) {
|
||||
l2arc_pool_markers_init(spa);
|
||||
}
|
||||
|
||||
list_insert_head(l2arc_dev_list, adddev);
|
||||
atomic_inc_64(&l2arc_ndev);
|
||||
spa->spa_l2arc_info.l2arc_total_capacity += (adddev->l2ad_end -
|
||||
adddev->l2ad_start);
|
||||
l2arc_update_smallest_capacity(spa);
|
||||
mutex_exit(&l2arc_dev_mtx);
|
||||
}
|
||||
|
||||
@ -10190,6 +10435,17 @@ l2arc_remove_vdev(vdev_t *vd)
|
||||
list_remove(l2arc_dev_list, remdev);
|
||||
l2arc_dev_last = NULL; /* may have been invalidated */
|
||||
atomic_dec_64(&l2arc_ndev);
|
||||
spa->spa_l2arc_info.l2arc_total_capacity -=
|
||||
(remdev->l2ad_end - remdev->l2ad_start);
|
||||
l2arc_update_smallest_capacity(spa);
|
||||
|
||||
/*
|
||||
* Clean up pool-based markers if this was the last L2ARC device
|
||||
* for this pool
|
||||
*/
|
||||
if (!l2arc_pool_has_devices(spa)) {
|
||||
l2arc_pool_markers_fini(spa);
|
||||
}
|
||||
|
||||
/* During a pool export spa & vdev will no longer be valid */
|
||||
if (asynchronous) {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user