Wire O_DIRECT also to Uncached I/O (#17218)

Before Direct I/O was implemented, I had implemented a lighter version
that I called Uncached I/O.  It uses the normal DMU/ARC data path with
some optimizations, but evicts data from caches as soon as possible and
reasonable.  Originally I wired it only to the primarycache property,
but now I am completing the integration all the way up to the VFS.

While Direct I/O has the lowest possible memory bandwidth usage,
it also has a significant number of limitations.  It requires I/Os
to be page aligned, does not allow speculative prefetch, etc.  The
Uncached I/O does not have those limitations, but instead requires
an additional memory copy, though still one less than regular cached
I/O.  As such it should fill the gap in between.  Considering this
I've disabled annoying EINVAL errors on misaligned requests, adding
a tunable for those who want to test their applications.

To pass the information between the layers I had to change a number
of APIs.  But as a side effect upper layers can now control not only
the caching, but also speculative prefetch.  I haven't wired it to
VFS yet, since it requires looking into some OS specifics.  But while
there I've implemented speculative prefetch of indirect blocks for
Direct I/O, controllable via all the same mechanisms.

Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Fixes #17027
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Alexander Motin
2025-05-13 17:26:55 -04:00
committed by GitHub
parent e2ba0f7643
commit 734eba251d
35 changed files with 397 additions and 294 deletions
+55 -33
View File
@@ -1499,7 +1499,8 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
*/
static int
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn,
dmu_flags_t flags)
{
objset_t *os = db->db_objset;
dmu_buf_impl_t *dndb;
@@ -1507,7 +1508,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
zbookmark_phys_t zb;
int err;
if ((flags & DB_RF_NO_DECRYPT) != 0 ||
if ((flags & DMU_READ_NO_DECRYPT) != 0 ||
!os->os_encrypted || os->os_raw_receive ||
(dndb = dn->dn_dbuf) == NULL)
return (0);
@@ -1561,7 +1562,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
* returning.
*/
static int
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
db_lock_type_t dblt, blkptr_t *bp, const void *tag)
{
zbookmark_phys_t zb;
@@ -1627,7 +1628,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
zio_flags = (flags & DB_RF_CANFAIL) ?
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;
/*
@@ -1728,7 +1729,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
}
int
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags)
{
dnode_t *dn;
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
@@ -1748,12 +1749,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
goto done;
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
(flags & DB_RF_NOPREFETCH) == 0;
(flags & DMU_READ_NO_PREFETCH) == 0;
mutex_enter(&db->db_mtx);
if (flags & DB_RF_PARTIAL_FIRST)
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
db->db_pending_evict = B_FALSE;
if (flags & DMU_PARTIAL_FIRST)
db->db_partial_read = B_TRUE;
else if (!(flags & DB_RF_PARTIAL_MORE))
else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING)))
db->db_partial_read = B_FALSE;
miss = (db->db_state != DB_CACHED);
@@ -1794,7 +1797,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
* unauthenticated blocks, which will verify their MAC if
* the key is now available.
*/
if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL &&
(arc_is_encrypted(db->db_buf) ||
arc_is_unauthenticated(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@@ -1842,7 +1845,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
if (err == 0 && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
flags & DB_RF_HAVESTRUCT);
flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) ||
db->db_pending_evict);
}
DB_DNODE_EXIT(db);
@@ -1874,11 +1878,14 @@ done:
}
static void
dbuf_noread(dmu_buf_impl_t *db)
dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)
{
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
db->db_pending_evict = B_FALSE;
db->db_partial_read = B_FALSE;
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
@@ -2191,8 +2198,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
kmem_free(dr, sizeof (*dr));
return (NULL);
}
int err = dbuf_read(parent_db, NULL,
(DB_RF_NOPREFETCH | DB_RF_CANFAIL));
int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL |
DMU_READ_NO_PREFETCH);
if (err != 0) {
dbuf_rele(parent_db, FTAG);
kmem_free(dr, sizeof (*dr));
@@ -2620,8 +2627,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
}
static void
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
void
dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
boolean_t undirty = B_FALSE;
@@ -2673,7 +2680,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
* not the underlying block that is being replaced. dbuf_undirty() will
* do brt_pending_remove() before removing the dirty record.
*/
(void) dbuf_read(db, NULL, flags);
(void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED);
if (undirty) {
mutex_enter(&db->db_mtx);
VERIFY(!dbuf_undirty(db, tx));
@@ -2685,8 +2692,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_will_dirty_impl(db_fake,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
}
boolean_t
@@ -2850,7 +2856,7 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
dbuf_noread(db);
dbuf_noread(db, DMU_KEEP_CACHING);
(void) dbuf_dirty(db, tx);
}
@@ -2864,12 +2870,13 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
DTRACE_SET_STATE(db, "allocating NOFILL buffer");
mutex_exit(&db->db_mtx);
dbuf_noread(db);
dbuf_noread(db, DMU_KEEP_CACHING);
(void) dbuf_dirty(db, tx);
}
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,
dmu_flags_t flags)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@@ -2891,7 +2898,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
*/
if (canfail && dr) {
mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx);
dmu_buf_will_dirty_flags(db_fake, tx, flags);
return;
}
/*
@@ -2907,10 +2914,16 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
}
mutex_exit(&db->db_mtx);
dbuf_noread(db);
dbuf_noread(db, flags);
(void) dbuf_dirty(db, tx);
}
/*
 * Convenience form of dmu_buf_will_fill_flags(): forwards db_fake, tx and
 * canfail unchanged and passes DMU_READ_NO_PREFETCH as the flags argument.
 */
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
{
dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH);
}
/*
* This function is effectively the same as dmu_buf_will_dirty(), but
* indicates the caller expects raw encrypted data in the db, and provides
@@ -2933,8 +2946,8 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
ASSERT0(db->db_level);
ASSERT(db->db_objset->os_raw_receive);
dmu_buf_will_dirty_impl(db_fake,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
dmu_buf_will_dirty_flags(db_fake, tx,
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
dr = dbuf_find_dirty_eq(db, tx->tx_txg);
@@ -3076,7 +3089,8 @@ dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
*/
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
dmu_flags_t flags)
{
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
@@ -3090,6 +3104,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
ASSERT(arc_released(buf));
mutex_enter(&db->db_mtx);
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
db->db_pending_evict = B_FALSE;
db->db_partial_read = B_FALSE;
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
@@ -3344,8 +3361,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
if (err)
return (err);
err = dbuf_read(*parentp, NULL,
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL |
DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH);
if (err) {
dbuf_rele(*parentp, NULL);
*parentp = NULL;
@@ -3404,7 +3421,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_user = NULL;
db->db_user_immediate_evict = FALSE;
db->db_freed_in_flight = FALSE;
db->db_pending_evict = FALSE;
db->db_pending_evict = TRUE;
db->db_partial_read = FALSE;
if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
@@ -3615,8 +3633,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dbuf_prefetch_fini(dpa, B_TRUE);
return;
}
(void) dbuf_read(db, NULL,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
(void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
DMU_READ_NO_PREFETCH);
dbuf_rele(db, FTAG);
}
@@ -4002,6 +4020,7 @@ dbuf_create_bonus(dnode_t *dn)
ASSERT(dn->dn_bonus == NULL);
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
dn->dn_bonus->db_pending_evict = FALSE;
}
int
@@ -4167,8 +4186,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
* This dbuf has anonymous data associated with it.
*/
dbuf_destroy(db);
} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
db->db_pending_evict) {
} else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) {
/*
* We don't expect more accesses to the dbuf, and it
* is either not cacheable or was marked for eviction.
*/
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);