Wire O_DIRECT also to Uncached I/O (#17218)

Before Direct I/O was implemented, I've implemented lighter version
I called Uncached I/O.  It uses normal DMU/ARC data path with some
optimizations, but evicts data from caches as soon as possible and
reasonable.  Originally I wired it only to a primarycache property,
but now completing the integration all the way up to the VFS.

While Direct I/O has the lowest possible memory bandwidth usage,
it also has a significant number of limitations.  It require I/Os
to be page aligned, does not allow speculative prefetch, etc.  The
Uncached I/O does not have those limitations, but instead require
additional memory copy, though still one less than regular cached
I/O.  As such it should fill the gap in between.  Considering this
I've disabled annoying EINVAL errors on misaligned requests, adding
a tunable for those who wants to test their applications.

To pass the information between the layers I had to change a number
of APIs.  But as side effect upper layers can now control not only
the caching, but also speculative prefetch.  I haven't wired it to
VFS yet, since it require looking on some OS specifics.  But while
there I've implemented speculative prefetch of indirect blocks for
Direct I/O, controllable via all the same mechanisms.

Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Fixes #17027
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Alexander Motin
2025-05-13 17:26:55 -04:00
committed by GitHub
parent e2ba0f7643
commit 734eba251d
35 changed files with 397 additions and 294 deletions
+7 -18
View File
@@ -45,20 +45,6 @@ extern "C" {
#define IN_DMU_SYNC 2
/*
* define flags for dbuf_read
*/
#define DB_RF_MUST_SUCCEED (1 << 0)
#define DB_RF_CANFAIL (1 << 1)
#define DB_RF_HAVESTRUCT (1 << 2)
#define DB_RF_NOPREFETCH (1 << 3)
#define DB_RF_NEVERWAIT (1 << 4)
#define DB_RF_CACHED (1 << 5)
#define DB_RF_NO_DECRYPT (1 << 6)
#define DB_RF_PARTIAL_FIRST (1 << 7)
#define DB_RF_PARTIAL_MORE (1 << 8)
/*
* The simplified state transition diagram for dbufs looks like:
*
@@ -389,12 +375,15 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag,
dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
uint64_t blkid, uint64_t *hash_out);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, dmu_flags_t flags);
void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
void dmu_buf_will_fill_flags(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail,
dmu_flags_t flags);
boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
dmu_flags_t flags);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
dmu_tx_t *tx);
@@ -475,10 +464,10 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
#define DBUF_GET_BUFC_TYPE(_db) \
(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
#define DBUF_IS_CACHEABLE(_db) \
#define DBUF_IS_CACHEABLE(_db) (!(_db)->db_pending_evict && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))))
boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp);
+42 -25
View File
@@ -532,6 +532,26 @@ void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
struct zio_prop *zp);
/*
* DB_RF_* are to be used for dbuf_read() or in limited other cases.
*/
typedef enum dmu_flags {
DB_RF_MUST_SUCCEED = 0, /* Suspend on I/O errors. */
DB_RF_CANFAIL = 1 << 0, /* Return on I/O errors. */
DB_RF_HAVESTRUCT = 1 << 1, /* dn_struct_rwlock is locked. */
DB_RF_NEVERWAIT = 1 << 2,
DMU_READ_PREFETCH = 0, /* Try speculative prefetch. */
DMU_READ_NO_PREFETCH = 1 << 3, /* Don't prefetch speculatively. */
DB_RF_NOPREFETCH = DMU_READ_NO_PREFETCH,
DMU_READ_NO_DECRYPT = 1 << 4, /* Don't decrypt. */
DB_RF_NO_DECRYPT = DMU_READ_NO_DECRYPT,
DMU_DIRECTIO = 1 << 5, /* Bypass ARC. */
DMU_UNCACHEDIO = 1 << 6, /* Reduce caching. */
DMU_PARTIAL_FIRST = 1 << 7, /* First partial access. */
DMU_PARTIAL_MORE = 1 << 8, /* Following partial access. */
DMU_KEEP_CACHING = 1 << 9, /* Don't affect caching. */
} dmu_flags_t;
/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
@@ -547,7 +567,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag,
dmu_buf_t **dbp);
int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
uint32_t flags);
dmu_flags_t flags);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
@@ -558,9 +578,9 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
* Special spill buffer support used by "SA" framework
*/
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags,
const void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags,
const void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
@@ -579,17 +599,17 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
* The object number must be a valid, allocated object number.
*/
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **, int flags);
const void *tag, dmu_buf_t **, dmu_flags_t flags);
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp);
int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags);
const void *tag, dmu_buf_t **dbp, dmu_flags_t flags);
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp, uint32_t flags);
dmu_buf_t ***dbpp, dmu_flags_t flags);
int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
dmu_buf_t **dbp);
@@ -781,6 +801,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
* (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
*/
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags);
boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
@@ -874,40 +895,36 @@ int dmu_free_long_object(objset_t *os, uint64_t object);
* Canfail routines will return 0 on success, or an errno if there is a
* nonrecoverable I/O error.
*/
#define DMU_READ_PREFETCH 0 /* prefetch */
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */
#define DMU_DIRECTIO 4 /* use Direct I/O */
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags);
void *buf, dmu_flags_t flags);
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
uint32_t flags);
dmu_flags_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx, uint32_t flags);
const void *buf, dmu_tx_t *tx, dmu_flags_t flags);
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx);
#ifdef _KERNEL
int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size);
int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size);
int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, dmu_flags_t flags);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, dmu_flags_t flags);
int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, dmu_flags_t flags);
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset,
struct arc_buf *buf, dmu_tx_t *tx);
struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset,
struct arc_buf *buf, dmu_tx_t *tx);
struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
#define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf
extern uint_t zfs_max_recordsize;
+6 -4
View File
@@ -270,11 +270,13 @@ void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *);
int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags);
int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *);
int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t);
int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t,
dmu_tx_t *);
#if defined(_KERNEL)
int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t);
int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *);
int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t);
int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t,
dmu_tx_t *);
#endif
#ifdef __cplusplus
+3 -2
View File
@@ -81,9 +81,10 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *);
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
boolean_t);
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t,
boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
boolean_t, boolean_t);
#ifdef __cplusplus
+2 -2
View File
@@ -981,9 +981,9 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
uint64_t extents_skipped, uint64_t bytes_skipped,
uint64_t extents_failed, uint64_t bytes_failed);
extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
uint32_t flags);
dmu_flags_t flags);
extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
uint32_t flags);
dmu_flags_t flags);
extern void spa_import_progress_add(spa_t *spa);
extern void spa_import_progress_remove(uint64_t spa_guid);
extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,
+4 -2
View File
@@ -33,7 +33,9 @@
/*
* Platform-dependent resource accounting hooks
*/
void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags);
void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags);
#endif /* _SYS_ZFS_RACCT_H */