mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 10:54:35 +03:00
Wire O_DIRECT also to Uncached I/O (#17218)
Before Direct I/O was implemented, I had implemented a lighter version that I called Uncached I/O. It uses the normal DMU/ARC data path with some optimizations, but evicts data from caches as soon as possible and reasonable. Originally I wired it only to the primarycache property, but I am now completing the integration all the way up to the VFS. While Direct I/O has the lowest possible memory bandwidth usage, it also has a significant number of limitations: it requires I/Os to be page aligned, does not allow speculative prefetch, etc. Uncached I/O does not have those limitations, but instead requires an additional memory copy, though still one less than regular cached I/O. As such it should fill the gap in between. Considering this, I've disabled the annoying EINVAL errors on misaligned requests, adding a tunable for those who want to test their applications. To pass the information between the layers I had to change a number of APIs. But as a side effect, upper layers can now control not only the caching, but also speculative prefetch. I haven't wired it to VFS yet, since it requires looking at some OS specifics. But while there I've implemented speculative prefetch of indirect blocks for Direct I/O, controllable via all the same mechanisms. Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Fixes #17027 Reviewed-by: Rob Norris <robn@despairlabs.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
+55
-33
@@ -1499,7 +1499,8 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
|
||||
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
|
||||
*/
|
||||
static int
|
||||
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
objset_t *os = db->db_objset;
|
||||
dmu_buf_impl_t *dndb;
|
||||
@@ -1507,7 +1508,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
zbookmark_phys_t zb;
|
||||
int err;
|
||||
|
||||
if ((flags & DB_RF_NO_DECRYPT) != 0 ||
|
||||
if ((flags & DMU_READ_NO_DECRYPT) != 0 ||
|
||||
!os->os_encrypted || os->os_raw_receive ||
|
||||
(dndb = dn->dn_dbuf) == NULL)
|
||||
return (0);
|
||||
@@ -1561,7 +1562,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
* returning.
|
||||
*/
|
||||
static int
|
||||
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
|
||||
db_lock_type_t dblt, blkptr_t *bp, const void *tag)
|
||||
{
|
||||
zbookmark_phys_t zb;
|
||||
@@ -1627,7 +1628,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||
zio_flags = (flags & DB_RF_CANFAIL) ?
|
||||
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
|
||||
|
||||
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
|
||||
if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp))
|
||||
zio_flags |= ZIO_FLAG_RAW;
|
||||
|
||||
/*
|
||||
@@ -1728,7 +1729,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
|
||||
}
|
||||
|
||||
int
|
||||
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags)
|
||||
{
|
||||
dnode_t *dn;
|
||||
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
|
||||
@@ -1748,12 +1749,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
goto done;
|
||||
|
||||
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
||||
(flags & DB_RF_NOPREFETCH) == 0;
|
||||
(flags & DMU_READ_NO_PREFETCH) == 0;
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
if (flags & DB_RF_PARTIAL_FIRST)
|
||||
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
|
||||
db->db_pending_evict = B_FALSE;
|
||||
if (flags & DMU_PARTIAL_FIRST)
|
||||
db->db_partial_read = B_TRUE;
|
||||
else if (!(flags & DB_RF_PARTIAL_MORE))
|
||||
else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING)))
|
||||
db->db_partial_read = B_FALSE;
|
||||
miss = (db->db_state != DB_CACHED);
|
||||
|
||||
@@ -1794,7 +1797,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
* unauthenticated blocks, which will verify their MAC if
|
||||
* the key is now available.
|
||||
*/
|
||||
if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
|
||||
if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL &&
|
||||
(arc_is_encrypted(db->db_buf) ||
|
||||
arc_is_unauthenticated(db->db_buf) ||
|
||||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
|
||||
@@ -1842,7 +1845,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
|
||||
if (err == 0 && prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
|
||||
flags & DB_RF_HAVESTRUCT);
|
||||
flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) ||
|
||||
db->db_pending_evict);
|
||||
}
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
@@ -1874,11 +1878,14 @@ done:
|
||||
}
|
||||
|
||||
static void
|
||||
dbuf_noread(dmu_buf_impl_t *db)
|
||||
dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)
|
||||
{
|
||||
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
||||
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
||||
mutex_enter(&db->db_mtx);
|
||||
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
|
||||
db->db_pending_evict = B_FALSE;
|
||||
db->db_partial_read = B_FALSE;
|
||||
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
||||
cv_wait(&db->db_changed, &db->db_mtx);
|
||||
if (db->db_state == DB_UNCACHED) {
|
||||
@@ -2191,8 +2198,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
|
||||
kmem_free(dr, sizeof (*dr));
|
||||
return (NULL);
|
||||
}
|
||||
int err = dbuf_read(parent_db, NULL,
|
||||
(DB_RF_NOPREFETCH | DB_RF_CANFAIL));
|
||||
int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL |
|
||||
DMU_READ_NO_PREFETCH);
|
||||
if (err != 0) {
|
||||
dbuf_rele(parent_db, FTAG);
|
||||
kmem_free(dr, sizeof (*dr));
|
||||
@@ -2620,8 +2627,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
static void
|
||||
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
|
||||
void
|
||||
dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
||||
boolean_t undirty = B_FALSE;
|
||||
@@ -2673,7 +2680,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
|
||||
* not the underlying block that is being replaced. dbuf_undirty() will
|
||||
* do brt_pending_remove() before removing the dirty record.
|
||||
*/
|
||||
(void) dbuf_read(db, NULL, flags);
|
||||
(void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED);
|
||||
if (undirty) {
|
||||
mutex_enter(&db->db_mtx);
|
||||
VERIFY(!dbuf_undirty(db, tx));
|
||||
@@ -2685,8 +2692,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
|
||||
void
|
||||
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
{
|
||||
dmu_buf_will_dirty_impl(db_fake,
|
||||
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
|
||||
dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
@@ -2850,7 +2856,7 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
DBUF_VERIFY(db);
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dbuf_noread(db);
|
||||
dbuf_noread(db, DMU_KEEP_CACHING);
|
||||
(void) dbuf_dirty(db, tx);
|
||||
}
|
||||
|
||||
@@ -2864,12 +2870,13 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
DTRACE_SET_STATE(db, "allocating NOFILL buffer");
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dbuf_noread(db);
|
||||
dbuf_noread(db, DMU_KEEP_CACHING);
|
||||
(void) dbuf_dirty(db, tx);
|
||||
}
|
||||
|
||||
void
|
||||
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
||||
dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
||||
|
||||
@@ -2891,7 +2898,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
||||
*/
|
||||
if (canfail && dr) {
|
||||
mutex_exit(&db->db_mtx);
|
||||
dmu_buf_will_dirty(db_fake, tx);
|
||||
dmu_buf_will_dirty_flags(db_fake, tx, flags);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
@@ -2907,10 +2914,16 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
||||
}
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dbuf_noread(db);
|
||||
dbuf_noread(db, flags);
|
||||
(void) dbuf_dirty(db, tx);
|
||||
}
|
||||
|
||||
void
|
||||
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
||||
{
|
||||
dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is effectively the same as dmu_buf_will_dirty(), but
|
||||
* indicates the caller expects raw encrypted data in the db, and provides
|
||||
@@ -2933,8 +2946,8 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
|
||||
ASSERT0(db->db_level);
|
||||
ASSERT(db->db_objset->os_raw_receive);
|
||||
|
||||
dmu_buf_will_dirty_impl(db_fake,
|
||||
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
|
||||
dmu_buf_will_dirty_flags(db_fake, tx,
|
||||
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
|
||||
|
||||
dr = dbuf_find_dirty_eq(db, tx->tx_txg);
|
||||
|
||||
@@ -3076,7 +3089,8 @@ dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
|
||||
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
|
||||
*/
|
||||
void
|
||||
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
|
||||
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
||||
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
||||
@@ -3090,6 +3104,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
|
||||
ASSERT(arc_released(buf));
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
|
||||
db->db_pending_evict = B_FALSE;
|
||||
db->db_partial_read = B_FALSE;
|
||||
|
||||
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
||||
cv_wait(&db->db_changed, &db->db_mtx);
|
||||
@@ -3344,8 +3361,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
|
||||
|
||||
if (err)
|
||||
return (err);
|
||||
err = dbuf_read(*parentp, NULL,
|
||||
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
|
||||
err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL |
|
||||
DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH);
|
||||
if (err) {
|
||||
dbuf_rele(*parentp, NULL);
|
||||
*parentp = NULL;
|
||||
@@ -3404,7 +3421,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
|
||||
db->db_user = NULL;
|
||||
db->db_user_immediate_evict = FALSE;
|
||||
db->db_freed_in_flight = FALSE;
|
||||
db->db_pending_evict = FALSE;
|
||||
db->db_pending_evict = TRUE;
|
||||
db->db_partial_read = FALSE;
|
||||
|
||||
if (blkid == DMU_BONUS_BLKID) {
|
||||
ASSERT3P(parent, ==, dn->dn_dbuf);
|
||||
@@ -3615,8 +3633,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
|
||||
dbuf_prefetch_fini(dpa, B_TRUE);
|
||||
return;
|
||||
}
|
||||
(void) dbuf_read(db, NULL,
|
||||
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
|
||||
(void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
|
||||
DMU_READ_NO_PREFETCH);
|
||||
dbuf_rele(db, FTAG);
|
||||
}
|
||||
|
||||
@@ -4002,6 +4020,7 @@ dbuf_create_bonus(dnode_t *dn)
|
||||
ASSERT(dn->dn_bonus == NULL);
|
||||
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
|
||||
dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
|
||||
dn->dn_bonus->db_pending_evict = FALSE;
|
||||
}
|
||||
|
||||
int
|
||||
@@ -4167,8 +4186,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
|
||||
* This dbuf has anonymous data associated with it.
|
||||
*/
|
||||
dbuf_destroy(db);
|
||||
} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
|
||||
db->db_pending_evict) {
|
||||
} else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) {
|
||||
/*
|
||||
* We don't expect more accesses to the dbuf, and it
|
||||
* is either not cacheable or was marked for eviction.
|
||||
*/
|
||||
dbuf_destroy(db);
|
||||
} else if (!multilist_link_active(&db->db_cache_link)) {
|
||||
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
|
||||
|
||||
Reference in New Issue
Block a user