mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-04-17 08:54:52 +03:00
L2ARC: Implement even-depth multi-sublist scanning
The introduction of ARC multilists made L2ARC writing quite random, depending on whether it found something to write in a randomly selected sublist. This created inconsistent write patterns and poor utilization of available sublists leading to uneven cache population. This commit replaces random selection with systematic scanning across all sublists within each burst. Fair headroom distribution ensures even-depth traversal across all sublists until the target write size is reached. Round-robin processing with random starting points eliminates sequential bias while maintaining predictable write behavior. The systematic approach provides consistent L2ARC filling patterns and better utilization of available ARC data across all sublists. Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Ameer Hamza <ahamza@ixsystems.com> Closes #18093
This commit is contained in:
parent
07ae463d1a
commit
3523b5f3f9
278
module/zfs/arc.c
278
module/zfs/arc.c
@@ -9044,48 +9044,44 @@ l2arc_read_done(zio_t *zio)
|
||||
}
|
||||
|
||||
/*
|
||||
* This is the list priority from which the L2ARC will search for pages to
|
||||
* cache. This is used within loops (0..3) to cycle through lists in the
|
||||
* desired order. This order can have a significant effect on cache
|
||||
* performance.
|
||||
* Get the multilist for the given list number (0..3) to cycle through
|
||||
* lists in the desired order. This order can have a significant effect
|
||||
* on cache performance.
|
||||
*
|
||||
* Currently the metadata lists are hit first, MFU then MRU, followed by
|
||||
* the data lists. This function returns a locked list, and also returns
|
||||
* the lock pointer.
|
||||
* the data lists.
|
||||
*/
|
||||
static multilist_sublist_t *
|
||||
l2arc_sublist_lock(int list_num)
|
||||
static multilist_t *
|
||||
l2arc_get_list(int list_num)
|
||||
{
|
||||
multilist_t *ml = NULL;
|
||||
unsigned int idx;
|
||||
|
||||
ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
|
||||
|
||||
switch (list_num) {
|
||||
case 0:
|
||||
ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
|
||||
break;
|
||||
return (&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
|
||||
case 1:
|
||||
ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
|
||||
break;
|
||||
return (&arc_mru->arcs_list[ARC_BUFC_METADATA]);
|
||||
case 2:
|
||||
ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
|
||||
break;
|
||||
return (&arc_mfu->arcs_list[ARC_BUFC_DATA]);
|
||||
case 3:
|
||||
ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
|
||||
break;
|
||||
return (&arc_mru->arcs_list[ARC_BUFC_DATA]);
|
||||
default:
|
||||
return (NULL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Return a randomly-selected sublist. This is acceptable
|
||||
* because the caller feeds only a little bit of data for each
|
||||
* call (8MB). Subsequent calls will result in different
|
||||
* sublists being selected.
|
||||
* Lock a specific sublist within the given list number.
|
||||
*/
|
||||
idx = multilist_get_random_index(ml);
|
||||
return (multilist_sublist_lock_idx(ml, idx));
|
||||
static multilist_sublist_t *
|
||||
l2arc_sublist_lock(int list_num, int sublist_idx)
|
||||
{
|
||||
multilist_t *ml = l2arc_get_list(list_num);
|
||||
if (ml == NULL)
|
||||
return (NULL);
|
||||
|
||||
return (multilist_sublist_lock_idx(ml, sublist_idx));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -9457,78 +9453,24 @@ error:
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static void
|
||||
l2arc_blk_fetch_done(zio_t *zio)
|
||||
/*
|
||||
* Process a single sublist for L2ARC writing.
|
||||
* Returns B_TRUE if target size reached, B_FALSE otherwise.
|
||||
*/
|
||||
static boolean_t
|
||||
l2arc_write_sublist(spa_t *spa, l2arc_dev_t *dev, multilist_sublist_t *mls,
|
||||
arc_buf_hdr_t *marker, boolean_t from_head, uint64_t target_sz,
|
||||
uint64_t *write_asize, uint64_t *write_psize, zio_t **pio,
|
||||
l2arc_write_callback_t **cb, arc_buf_hdr_t *head, uint64_t *consumed,
|
||||
uint64_t sublist_headroom, uint64_t guid)
|
||||
{
|
||||
l2arc_read_callback_t *cb;
|
||||
|
||||
cb = zio->io_private;
|
||||
if (cb->l2rcb_abd != NULL)
|
||||
abd_free(cb->l2rcb_abd);
|
||||
kmem_free(cb, sizeof (l2arc_read_callback_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Find and write ARC buffers to the L2ARC device.
|
||||
*
|
||||
* An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
|
||||
* for reading until they have completed writing.
|
||||
* The headroom_boost is an in-out parameter used to maintain headroom boost
|
||||
* state between calls to this function.
|
||||
*
|
||||
* Returns the number of bytes actually written (which may be smaller than
|
||||
* the delta by which the device hand has changed due to alignment and the
|
||||
* writing of log blocks).
|
||||
*/
|
||||
static uint64_t
|
||||
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||
{
|
||||
arc_buf_hdr_t *hdr, *head, *marker;
|
||||
uint64_t write_asize, write_psize, headroom;
|
||||
boolean_t full, from_head = !arc_warm;
|
||||
l2arc_write_callback_t *cb = NULL;
|
||||
zio_t *pio, *wzio;
|
||||
uint64_t guid = spa_load_guid(spa);
|
||||
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
|
||||
|
||||
ASSERT3P(dev->l2ad_vdev, !=, NULL);
|
||||
|
||||
pio = NULL;
|
||||
write_asize = write_psize = 0;
|
||||
full = B_FALSE;
|
||||
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
|
||||
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
|
||||
marker = arc_state_alloc_marker();
|
||||
|
||||
/*
|
||||
* Copy buffers for L2ARC writing.
|
||||
*/
|
||||
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
|
||||
/*
|
||||
* pass == 0: MFU meta
|
||||
* pass == 1: MRU meta
|
||||
* pass == 2: MFU data
|
||||
* pass == 3: MRU data
|
||||
*/
|
||||
if (l2arc_mfuonly == 1) {
|
||||
if (pass == 1 || pass == 3)
|
||||
continue;
|
||||
} else if (l2arc_mfuonly > 1) {
|
||||
if (pass == 3)
|
||||
continue;
|
||||
}
|
||||
|
||||
uint64_t passed_sz = 0;
|
||||
headroom = target_sz * l2arc_headroom;
|
||||
if (zfs_compressed_arc_enabled)
|
||||
headroom = (headroom * l2arc_headroom_boost) / 100;
|
||||
arc_buf_hdr_t *hdr;
|
||||
boolean_t full = B_FALSE;
|
||||
|
||||
/*
|
||||
* Until the ARC is warm and starts to evict, read from the
|
||||
* head of the ARC lists rather than the tail.
|
||||
*/
|
||||
multilist_sublist_t *mls = l2arc_sublist_lock(pass);
|
||||
ASSERT3P(mls, !=, NULL);
|
||||
if (from_head)
|
||||
hdr = multilist_sublist_head(mls);
|
||||
else
|
||||
@@ -9549,15 +9491,18 @@ skip:
|
||||
continue;
|
||||
}
|
||||
|
||||
passed_sz += HDR_GET_LSIZE(hdr);
|
||||
if (l2arc_headroom != 0 && passed_sz > headroom) {
|
||||
if (l2arc_headroom != 0 &&
|
||||
*consumed + HDR_GET_LSIZE(hdr) >
|
||||
MAX(sublist_headroom, HDR_GET_LSIZE(hdr))) {
|
||||
/*
|
||||
* Searched too far.
|
||||
* Searched too far in this sublist.
|
||||
*/
|
||||
mutex_exit(hash_lock);
|
||||
break;
|
||||
}
|
||||
|
||||
*consumed += HDR_GET_LSIZE(hdr);
|
||||
|
||||
if (!l2arc_write_eligible(guid, hdr)) {
|
||||
mutex_exit(hash_lock);
|
||||
goto skip;
|
||||
@@ -9566,18 +9511,16 @@ skip:
|
||||
ASSERT(HDR_HAS_L1HDR(hdr));
|
||||
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
|
||||
ASSERT3U(arc_hdr_size(hdr), >, 0);
|
||||
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
|
||||
HDR_HAS_RABD(hdr));
|
||||
ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
|
||||
uint64_t psize = HDR_GET_PSIZE(hdr);
|
||||
uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
|
||||
psize);
|
||||
uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
|
||||
|
||||
/*
|
||||
* If the allocated size of this buffer plus the max
|
||||
* size for the pending log block exceeds the evicted
|
||||
* target size, terminate writing buffers for this run.
|
||||
*/
|
||||
if (write_asize + asize +
|
||||
if (*write_asize + asize +
|
||||
sizeof (l2arc_log_blk_phys_t) > target_sz) {
|
||||
full = B_TRUE;
|
||||
mutex_exit(hash_lock);
|
||||
@@ -9590,11 +9533,9 @@ skip:
|
||||
* the position and drop the lock.
|
||||
*/
|
||||
if (from_head) {
|
||||
multilist_sublist_insert_after(mls, hdr,
|
||||
marker);
|
||||
multilist_sublist_insert_after(mls, hdr, marker);
|
||||
} else {
|
||||
multilist_sublist_insert_before(mls, hdr,
|
||||
marker);
|
||||
multilist_sublist_insert_before(mls, hdr, marker);
|
||||
}
|
||||
multilist_sublist_unlock(mls);
|
||||
|
||||
@@ -9626,8 +9567,7 @@ skip:
|
||||
ret = l2arc_apply_transforms(spa, hdr, asize,
|
||||
&to_write);
|
||||
if (ret != 0) {
|
||||
arc_hdr_clear_flags(hdr,
|
||||
ARC_FLAG_L2CACHE);
|
||||
arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
|
||||
mutex_exit(hash_lock);
|
||||
goto next;
|
||||
}
|
||||
@@ -9651,7 +9591,7 @@ skip:
|
||||
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
|
||||
|
||||
mutex_enter(&dev->l2ad_mtx);
|
||||
if (pio == NULL) {
|
||||
if (*pio == NULL) {
|
||||
/*
|
||||
* Insert a dummy header on the buflist so
|
||||
* l2arc_write_done() can find where the
|
||||
@@ -9665,36 +9605,34 @@ skip:
|
||||
boolean_t commit = l2arc_log_blk_insert(dev, hdr);
|
||||
mutex_exit(hash_lock);
|
||||
|
||||
if (pio == NULL) {
|
||||
cb = kmem_alloc(
|
||||
sizeof (l2arc_write_callback_t), KM_SLEEP);
|
||||
cb->l2wcb_dev = dev;
|
||||
cb->l2wcb_head = head;
|
||||
list_create(&cb->l2wcb_abd_list,
|
||||
if (*pio == NULL) {
|
||||
*cb = kmem_alloc(sizeof (l2arc_write_callback_t),
|
||||
KM_SLEEP);
|
||||
(*cb)->l2wcb_dev = dev;
|
||||
(*cb)->l2wcb_head = head;
|
||||
list_create(&(*cb)->l2wcb_abd_list,
|
||||
sizeof (l2arc_lb_abd_buf_t),
|
||||
offsetof(l2arc_lb_abd_buf_t, node));
|
||||
pio = zio_root(spa, l2arc_write_done, cb,
|
||||
*pio = zio_root(spa, l2arc_write_done, *cb,
|
||||
ZIO_FLAG_CANFAIL);
|
||||
}
|
||||
|
||||
wzio = zio_write_phys(pio, dev->l2ad_vdev,
|
||||
dev->l2ad_hand, asize, to_write,
|
||||
ZIO_CHECKSUM_OFF, NULL, hdr,
|
||||
ZIO_PRIORITY_ASYNC_WRITE,
|
||||
zio_t *wzio = zio_write_phys(*pio, dev->l2ad_vdev,
|
||||
dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF,
|
||||
NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE,
|
||||
ZIO_FLAG_CANFAIL, B_FALSE);
|
||||
|
||||
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
|
||||
zio_t *, wzio);
|
||||
zio_nowait(wzio);
|
||||
|
||||
write_psize += psize;
|
||||
write_asize += asize;
|
||||
*write_psize += psize;
|
||||
*write_asize += asize;
|
||||
dev->l2ad_hand += asize;
|
||||
|
||||
if (commit) {
|
||||
/* l2ad_hand will be adjusted inside. */
|
||||
write_asize +=
|
||||
l2arc_log_blk_commit(dev, pio, cb);
|
||||
*write_asize += l2arc_log_blk_commit(dev, *pio, *cb);
|
||||
}
|
||||
|
||||
next:
|
||||
@@ -9706,7 +9644,105 @@ next:
|
||||
multilist_sublist_remove(mls, marker);
|
||||
}
|
||||
|
||||
return (full);
|
||||
}
|
||||
|
||||
static void
|
||||
l2arc_blk_fetch_done(zio_t *zio)
|
||||
{
|
||||
l2arc_read_callback_t *cb;
|
||||
|
||||
cb = zio->io_private;
|
||||
if (cb->l2rcb_abd != NULL)
|
||||
abd_free(cb->l2rcb_abd);
|
||||
kmem_free(cb, sizeof (l2arc_read_callback_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Find and write ARC buffers to the L2ARC device.
|
||||
*
|
||||
* An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
|
||||
* for reading until they have completed writing.
|
||||
* The headroom_boost is an in-out parameter used to maintain headroom boost
|
||||
* state between calls to this function.
|
||||
*
|
||||
* Returns the number of bytes actually written (which may be smaller than
|
||||
* the delta by which the device hand has changed due to alignment and the
|
||||
* writing of log blocks).
|
||||
*/
|
||||
static uint64_t
|
||||
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||
{
|
||||
arc_buf_hdr_t *head, *marker;
|
||||
uint64_t write_asize, write_psize, headroom;
|
||||
boolean_t full, from_head = !arc_warm;
|
||||
l2arc_write_callback_t *cb = NULL;
|
||||
zio_t *pio;
|
||||
uint64_t guid = spa_load_guid(spa);
|
||||
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
|
||||
|
||||
ASSERT3P(dev->l2ad_vdev, !=, NULL);
|
||||
|
||||
pio = NULL;
|
||||
write_asize = write_psize = 0;
|
||||
full = B_FALSE;
|
||||
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
|
||||
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
|
||||
marker = arc_state_alloc_marker();
|
||||
|
||||
/*
|
||||
* Copy buffers for L2ARC writing.
|
||||
*/
|
||||
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
|
||||
/*
|
||||
* pass == 0: MFU meta
|
||||
* pass == 1: MRU meta
|
||||
* pass == 2: MFU data
|
||||
* pass == 3: MRU data
|
||||
*/
|
||||
if (l2arc_mfuonly == 1) {
|
||||
if (pass == 1 || pass == 3)
|
||||
continue;
|
||||
} else if (l2arc_mfuonly > 1) {
|
||||
if (pass == 3)
|
||||
continue;
|
||||
}
|
||||
|
||||
headroom = target_sz * l2arc_headroom;
|
||||
if (zfs_compressed_arc_enabled)
|
||||
headroom = (headroom * l2arc_headroom_boost) / 100;
|
||||
|
||||
multilist_t *ml = l2arc_get_list(pass);
|
||||
ASSERT3P(ml, !=, NULL);
|
||||
int num_sublists = multilist_get_num_sublists(ml);
|
||||
int current_sublist = multilist_get_random_index(ml);
|
||||
uint64_t consumed_headroom = 0;
|
||||
|
||||
int processed_sublists = 0;
|
||||
while (processed_sublists < num_sublists && !full) {
|
||||
uint64_t sublist_headroom;
|
||||
multilist_sublist_t *mls;
|
||||
|
||||
if (consumed_headroom >= headroom)
|
||||
break;
|
||||
|
||||
sublist_headroom = (headroom - consumed_headroom) /
|
||||
(num_sublists - processed_sublists);
|
||||
|
||||
if (sublist_headroom == 0)
|
||||
break;
|
||||
mls = l2arc_sublist_lock(pass, current_sublist);
|
||||
ASSERT3P(mls, !=, NULL);
|
||||
|
||||
full = l2arc_write_sublist(spa, dev, mls, marker,
|
||||
from_head, target_sz, &write_asize, &write_psize,
|
||||
&pio, &cb, head, &consumed_headroom,
|
||||
sublist_headroom, guid);
|
||||
|
||||
multilist_sublist_unlock(mls);
|
||||
current_sublist = (current_sublist + 1) % num_sublists;
|
||||
processed_sublists++;
|
||||
}
|
||||
|
||||
if (full == B_TRUE)
|
||||
break;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user