Wire O_DIRECT also to Uncached I/O (#17218)

Before Direct I/O was implemented, I had implemented a lighter
version that I called Uncached I/O.  It uses the normal DMU/ARC data
path with some optimizations, but evicts data from caches as soon as
possible and reasonable.  Originally I wired it only to the
primarycache property, but I am now completing the integration all
the way up to the VFS.

While Direct I/O has the lowest possible memory bandwidth usage,
it also has a significant number of limitations.  It requires I/Os
to be page aligned, does not allow speculative prefetch, etc.
Uncached I/O does not have those limitations, but instead requires
an additional memory copy, though still one less than regular cached
I/O.  As such it should fill the gap in between.  Considering this,
I've disabled the annoying EINVAL errors on misaligned requests,
adding a tunable for those who want to test their applications.

To pass the information between the layers I had to change a number
of APIs.  But as a side effect, upper layers can now control not only
the caching, but also speculative prefetch.  I haven't wired it to
the VFS yet, since it requires looking into some OS specifics.  But
while there I've implemented speculative prefetch of indirect blocks
for Direct I/O, controllable via all the same mechanisms.

Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Fixes #17027
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Alexander Motin
2025-05-13 17:26:55 -04:00
committed by GitHub
parent e2ba0f7643
commit 734eba251d
35 changed files with 397 additions and 294 deletions
+4 -4
View File
@@ -30,14 +30,14 @@
#include <linux/task_io_accounting_ops.h>
void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
task_io_account_read(size);
spa_iostats_read_add(spa, size, iops, flags);
}
void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
task_io_account_write(size);
spa_iostats_write_add(spa, size, iops, flags);
@@ -46,13 +46,13 @@ zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
#else
void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}
void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}
+1 -1
View File
@@ -329,7 +329,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
put_page(pp);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, bytes);
uio, bytes, DMU_READ_PREFETCH);
}
len -= bytes;
+4 -2
View File
@@ -258,7 +258,8 @@ zvol_write(zv_request_t *zvr)
dmu_tx_abort(tx);
break;
}
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
DMU_READ_PREFETCH);
if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync);
}
@@ -428,7 +429,8 @@ zvol_read(zv_request_t *zvr)
if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio.uio_loffset;
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
DMU_READ_PREFETCH);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)