Wire O_DIRECT also to Uncached I/O (#17218)

Before Direct I/O was implemented, I've implemented lighter version
I called Uncached I/O.  It uses normal DMU/ARC data path with some
optimizations, but evicts data from caches as soon as possible and
reasonable.  Originally I wired it only to a primarycache property,
but now completing the integration all the way up to the VFS.

While Direct I/O has the lowest possible memory bandwidth usage,
it also has a significant number of limitations.  It require I/Os
to be page aligned, does not allow speculative prefetch, etc.  The
Uncached I/O does not have those limitations, but instead require
additional memory copy, though still one less than regular cached
I/O.  As such it should fill the gap in between.  Considering this
I've disabled annoying EINVAL errors on misaligned requests, adding
a tunable for those who wants to test their applications.

To pass the information between the layers I had to change a number
of APIs.  But as side effect upper layers can now control not only
the caching, but also speculative prefetch.  I haven't wired it to
VFS yet, since it require looking on some OS specifics.  But while
there I've implemented speculative prefetch of indirect blocks for
Direct I/O, controllable via all the same mechanisms.

Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Fixes #17027
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Alexander Motin
2025-05-13 17:26:55 -04:00
committed by GitHub
parent e2ba0f7643
commit 734eba251d
35 changed files with 397 additions and 294 deletions
+12 -12
View File
@@ -1993,7 +1993,8 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
if (write_state == WR_COPIED &&
dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH |
DMU_KEEP_CACHING) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
write_state = WR_NEED_COPY;
@@ -2265,19 +2266,19 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
ASSERT(doi.doi_data_block_size);
ASSERT0(offset % doi.doi_data_block_size);
if (ztest_random(4) != 0) {
int prefetch = ztest_random(2) ?
dmu_flags_t flags = ztest_random(2) ?
DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
/*
* We will randomly set when to do O_DIRECT on a read.
*/
if (ztest_random(4) == 0)
prefetch |= DMU_DIRECTIO;
flags |= DMU_DIRECTIO;
ztest_block_tag_t rbt;
VERIFY(dmu_read(os, lr->lr_foid, offset,
sizeof (rbt), &rbt, prefetch) == 0);
sizeof (rbt), &rbt, flags) == 0);
if (rbt.bt_magic == BT_MAGIC) {
ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
offset, gen, txg, crtxg);
@@ -2308,7 +2309,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_write(os, lr->lr_foid, offset, length, data, tx);
} else {
memcpy(abuf->b_data, data, length);
VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx));
VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx, 0));
}
(void) ztest_log_write(zd, tx, lr);
@@ -2533,7 +2534,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
object, offset, size, ZTRL_READER);
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
ASSERT0(error);
} else {
ASSERT3P(zio, !=, NULL);
@@ -2549,7 +2550,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
object, offset, size, ZTRL_READER);
error = dmu_buf_hold_noread(os, object, offset, zgd, &db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@@ -2826,7 +2826,7 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
enum ztest_io_type io_type;
uint64_t blocksize;
void *data;
uint32_t dmu_read_flags = DMU_READ_NO_PREFETCH;
dmu_flags_t dmu_read_flags = DMU_READ_NO_PREFETCH;
/*
* We will randomly set when to do O_DIRECT on a read.
@@ -5065,7 +5065,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
uint64_t stride = 123456789ULL;
uint64_t width = 40;
int free_percent = 5;
uint32_t dmu_read_flags = DMU_READ_PREFETCH;
dmu_flags_t dmu_read_flags = DMU_READ_PREFETCH;
/*
* We will randomly set when to do O_DIRECT on a read.
@@ -5541,13 +5541,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
}
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off, bigbuf_arcbufs[j], tx));
off, bigbuf_arcbufs[j], tx, 0));
} else {
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off, bigbuf_arcbufs[2 * j], tx));
off, bigbuf_arcbufs[2 * j], tx, 0));
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off + chunksize / 2,
bigbuf_arcbufs[2 * j + 1], tx));
bigbuf_arcbufs[2 * j + 1], tx, 0));
}
if (i == 1) {
dmu_buf_rele(dbt, FTAG);