mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 10:37:35 +03:00
Wire O_DIRECT also to Uncached I/O (#17218)
Before Direct I/O was implemented, I had implemented a lighter version that I called Uncached I/O. It uses the normal DMU/ARC data path with some optimizations, but evicts data from the caches as soon as is possible and reasonable. Originally I wired it only to the primarycache property, but I am now completing the integration all the way up to the VFS. While Direct I/O has the lowest possible memory bandwidth usage, it also has a significant number of limitations: it requires I/Os to be page aligned, does not allow speculative prefetch, etc. Uncached I/O does not have those limitations, but instead requires an additional memory copy, though still one fewer than regular cached I/O. As such, it should fill the gap in between. Considering this, I've disabled the annoying EINVAL errors on misaligned requests, adding a tunable for those who want to test their applications. To pass the information between the layers I had to change a number of APIs. As a side effect, upper layers can now control not only the caching, but also speculative prefetch. I haven't wired that to the VFS yet, since it requires looking at some OS specifics. But while there, I've implemented speculative prefetch of indirect blocks for Direct I/O, controllable via all the same mechanisms. Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Fixes #17027 Reviewed-by: Rob Norris <robn@despairlabs.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
@@ -41,7 +41,6 @@
|
||||
#include <sys/dsl_pool.h>
|
||||
#include <sys/dsl_synctask.h>
|
||||
#include <sys/dsl_prop.h>
|
||||
#include <sys/dmu_zfetch.h>
|
||||
#include <sys/zfs_ioctl.h>
|
||||
#include <sys/zap.h>
|
||||
#include <sys/zio_checksum.h>
|
||||
@@ -71,6 +70,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
struct sf_buf *sf;
|
||||
int numbufs, i;
|
||||
int err;
|
||||
dmu_flags_t flags = 0;
|
||||
|
||||
if (size == 0)
|
||||
return (0);
|
||||
@@ -94,10 +94,17 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
|
||||
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
|
||||
|
||||
if (tocpy == db->db_size)
|
||||
if (tocpy == db->db_size) {
|
||||
dmu_buf_will_fill(db, tx, B_FALSE);
|
||||
else
|
||||
dmu_buf_will_dirty(db, tx);
|
||||
} else {
|
||||
if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
|
||||
if (bufoff == 0)
|
||||
flags |= DMU_PARTIAL_FIRST;
|
||||
else
|
||||
flags |= DMU_PARTIAL_MORE;
|
||||
}
|
||||
dmu_buf_will_dirty_flags(db, tx, flags);
|
||||
}
|
||||
|
||||
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
|
||||
ASSERT3U(ptoa((*ma)->pindex), ==,
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
#include <sys/racct.h>
|
||||
|
||||
void
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
curthread->td_ru.ru_inblock += iops;
|
||||
#ifdef RACCT
|
||||
@@ -46,7 +46,7 @@ zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
}
|
||||
|
||||
void
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
curthread->td_ru.ru_oublock += iops;
|
||||
#ifdef RACCT
|
||||
|
||||
@@ -530,7 +530,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
|
||||
page_unhold(pp);
|
||||
} else {
|
||||
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
||||
uio, bytes);
|
||||
uio, bytes, DMU_READ_PREFETCH);
|
||||
}
|
||||
len -= bytes;
|
||||
off = 0;
|
||||
|
||||
@@ -679,7 +679,7 @@ zvol_strategy_impl(zv_request_t *zvr)
|
||||
while (resid != 0 && off < volsize) {
|
||||
size_t size = MIN(resid, zvol_maxphys);
|
||||
if (doread) {
|
||||
error = dmu_read(os, ZVOL_OBJ, off, size, addr,
|
||||
error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
|
||||
DMU_READ_PREFETCH);
|
||||
} else {
|
||||
dmu_tx_t *tx = dmu_tx_create(os);
|
||||
@@ -688,7 +688,8 @@ zvol_strategy_impl(zv_request_t *zvr)
|
||||
if (error) {
|
||||
dmu_tx_abort(tx);
|
||||
} else {
|
||||
dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
|
||||
dmu_write_by_dnode(zv->zv_dn, off, size, addr,
|
||||
tx, DMU_READ_PREFETCH);
|
||||
zvol_log_write(zv, tx, off, size, commit);
|
||||
dmu_tx_commit(tx);
|
||||
}
|
||||
@@ -834,7 +835,8 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
|
||||
if (bytes > volsize - zfs_uio_offset(&uio))
|
||||
bytes = volsize - zfs_uio_offset(&uio);
|
||||
|
||||
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
|
||||
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
|
||||
DMU_READ_PREFETCH);
|
||||
if (error) {
|
||||
/* Convert checksum errors into IO errors. */
|
||||
if (error == ECKSUM)
|
||||
@@ -893,7 +895,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
|
||||
dmu_tx_abort(tx);
|
||||
break;
|
||||
}
|
||||
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
|
||||
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
|
||||
DMU_READ_PREFETCH);
|
||||
if (error == 0)
|
||||
zvol_log_write(zv, tx, off, bytes, commit);
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
@@ -30,14 +30,14 @@
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
|
||||
void
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
task_io_account_read(size);
|
||||
spa_iostats_read_add(spa, size, iops, flags);
|
||||
}
|
||||
|
||||
void
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
task_io_account_write(size);
|
||||
spa_iostats_write_add(spa, size, iops, flags);
|
||||
@@ -46,13 +46,13 @@ zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
#else
|
||||
|
||||
void
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
(void) spa, (void) size, (void) iops, (void) flags;
|
||||
}
|
||||
|
||||
void
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
(void) spa, (void) size, (void) iops, (void) flags;
|
||||
}
|
||||
|
||||
@@ -329,7 +329,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
|
||||
put_page(pp);
|
||||
} else {
|
||||
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
||||
uio, bytes);
|
||||
uio, bytes, DMU_READ_PREFETCH);
|
||||
}
|
||||
|
||||
len -= bytes;
|
||||
|
||||
@@ -258,7 +258,8 @@ zvol_write(zv_request_t *zvr)
|
||||
dmu_tx_abort(tx);
|
||||
break;
|
||||
}
|
||||
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
|
||||
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
|
||||
DMU_READ_PREFETCH);
|
||||
if (error == 0) {
|
||||
zvol_log_write(zv, tx, off, bytes, sync);
|
||||
}
|
||||
@@ -428,7 +429,8 @@ zvol_read(zv_request_t *zvr)
|
||||
if (bytes > volsize - uio.uio_loffset)
|
||||
bytes = volsize - uio.uio_loffset;
|
||||
|
||||
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
|
||||
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
|
||||
DMU_READ_PREFETCH);
|
||||
if (error) {
|
||||
/* convert checksum errors into IO errors */
|
||||
if (error == ECKSUM)
|
||||
|
||||
Reference in New Issue
Block a user