Implement uncached prefetch

Previously the primarycache property was handled only in the dbuf
layer. Since the speculative prefetcher is implemented in the ARC,
it had to be disabled for uncacheable buffers.

This change gives the ARC knowledge about uncacheable buffers
via  arc_read() and arc_write(). So when remove_reference() drops
the last reference on the ARC header, it can either immediately destroy
it, or if it is marked as prefetch, put it into a new arc_uncached state. 
That state is scanned every second, evicting stale buffers that were
not demand read.

This change also tracks dbufs that were read from the beginning,
but not to the end.  It is assumed that such buffers may receive further
reads, and so they are stored in dbuf cache. If a following
reads reaches the end of the buffer, it is immediately evicted.
Otherwise it will follow regular dbuf cache eviction.  Since the dbuf
layer does not know actual file sizes, this logic is not applied to
the final buffer of a dnode.

Since uncacheable buffers should no longer stay in the ARC for long,
this patch also tries to optimize I/O by allocating ARC physical
buffers as linear to allow buffer sharing.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #14243
This commit is contained in:
Alexander Motin
2023-01-04 19:29:54 -05:00
committed by GitHub
parent c935fe2e92
commit ed2f7ba08d
11 changed files with 243 additions and 139 deletions
+42 -57
View File
@@ -1608,7 +1608,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
DTRACE_SET_STATE(db, "read issued");
mutex_exit(&db->db_mtx);
if (dbuf_is_l2cacheable(db))
if (!DBUF_IS_CACHEABLE(db))
aflags |= ARC_FLAG_UNCACHED;
else if (dbuf_is_l2cacheable(db))
aflags |= ARC_FLAG_L2CACHE;
dbuf_add_ref(db, NULL);
@@ -1736,10 +1738,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
dn = DB_DNODE(db);
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
(flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
DBUF_IS_CACHEABLE(db);
(flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
mutex_enter(&db->db_mtx);
if (flags & DB_RF_PARTIAL_FIRST)
db->db_partial_read = B_TRUE;
else if (!(flags & DB_RF_PARTIAL_MORE))
db->db_partial_read = B_FALSE;
if (db->db_state == DB_CACHED) {
/*
* Ensure that this block's dnode has been decrypted if
@@ -3463,8 +3468,9 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
dpa->dpa_cb = cb;
dpa->dpa_arg = arg;
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
if (dnode_level_is_l2cacheable(&bp, dn, level))
if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
else if (dnode_level_is_l2cacheable(&bp, dn, level))
dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
/*
@@ -3853,59 +3859,38 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
* This dbuf has anonymous data associated with it.
*/
dbuf_destroy(db);
} else {
boolean_t do_arc_evict = B_FALSE;
blkptr_t bp;
spa_t *spa = dmu_objset_spa(db->db_objset);
} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
db->db_pending_evict) {
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
if (!DBUF_IS_CACHEABLE(db) &&
db->db_blkptr != NULL &&
!BP_IS_HOLE(db->db_blkptr) &&
!BP_IS_EMBEDDED(db->db_blkptr)) {
do_arc_evict = B_TRUE;
bp = *db->db_blkptr;
dbuf_cached_state_t dcs =
dbuf_include_in_metadata_cache(db) ?
DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
db->db_caching_status = dcs;
multilist_insert(&dbuf_caches[dcs].cache, db);
uint64_t db_size = db->db.db_size;
size = zfs_refcount_add_many(
&dbuf_caches[dcs].size, db_size, db);
uint8_t db_level = db->db_level;
mutex_exit(&db->db_mtx);
if (dcs == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMP(metadata_cache_count);
DBUF_STAT_MAX(metadata_cache_size_bytes_max,
size);
} else {
DBUF_STAT_BUMP(cache_count);
DBUF_STAT_MAX(cache_size_bytes_max, size);
DBUF_STAT_BUMP(cache_levels[db_level]);
DBUF_STAT_INCR(cache_levels_bytes[db_level],
db_size);
}
if (!DBUF_IS_CACHEABLE(db) ||
db->db_pending_evict) {
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
ASSERT3U(db->db_caching_status, ==,
DB_NO_CACHE);
dbuf_cached_state_t dcs =
dbuf_include_in_metadata_cache(db) ?
DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
db->db_caching_status = dcs;
multilist_insert(&dbuf_caches[dcs].cache, db);
uint64_t db_size = db->db.db_size;
size = zfs_refcount_add_many(
&dbuf_caches[dcs].size, db_size, db);
uint8_t db_level = db->db_level;
mutex_exit(&db->db_mtx);
if (dcs == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMP(metadata_cache_count);
DBUF_STAT_MAX(
metadata_cache_size_bytes_max,
size);
} else {
DBUF_STAT_BUMP(cache_count);
DBUF_STAT_MAX(cache_size_bytes_max,
size);
DBUF_STAT_BUMP(cache_levels[db_level]);
DBUF_STAT_INCR(
cache_levels_bytes[db_level],
db_size);
}
if (dcs == DB_DBUF_CACHE && !evicting)
dbuf_evict_notify(size);
}
if (do_arc_evict)
arc_freed(spa, &bp);
if (dcs == DB_DBUF_CACHE && !evicting)
dbuf_evict_notify(size);
}
} else {
mutex_exit(&db->db_mtx);
@@ -5083,8 +5068,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
children_ready_cb = dbuf_write_children_ready;
dr->dr_zio = arc_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, data, dbuf_is_l2cacheable(db),
&zp, dbuf_write_ready,
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
children_ready_cb, dbuf_write_physdone,
dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED, &zb);