Wire O_DIRECT also to Uncached I/O (#17218)

Before Direct I/O was implemented, I had implemented a lighter version
I called Uncached I/O.  It uses the normal DMU/ARC data path with some
optimizations, but evicts data from the caches as soon as possible and
reasonable.  Originally I wired it only to the primarycache property,
but this change completes the integration all the way up to the VFS.

While Direct I/O has the lowest possible memory bandwidth usage, it
also has a significant number of limitations.  It requires I/Os to be
page-aligned, does not allow speculative prefetch, etc.  Uncached I/O
does not have those limitations, but it requires an additional memory
copy, though still one fewer than regular cached I/O.  As such it
should fill the gap in between.  Considering this, I've disabled the
annoying EINVAL errors on misaligned requests, adding a tunable
(zfs_dio_strict) for those who want to test their applications.
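
As a rough illustration of the resulting behavior (the file path and
sizes below are made up; this is a sketch, not part of the change): a
misaligned read on an O_DIRECT file descriptor, which previously
failed with EINVAL, is now serviced through the ARC as uncached I/O
unless the new zfs_dio_strict tunable is set.

	#define _GNU_SOURCE	/* O_DIRECT on Linux */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		/* Hypothetical file on a ZFS dataset. */
		int fd = open("/tank/testfile", O_RDONLY | O_DIRECT);
		if (fd == -1) {
			perror("open");
			return (1);
		}

		/*
		 * 1000 bytes at offset 13 is neither page-aligned nor a
		 * multiple of the page size.  With zfs_dio_strict=0 this
		 * now succeeds as uncached I/O; with zfs_dio_strict=1 it
		 * still returns EINVAL.
		 */
		char buf[1000];
		ssize_t n = pread(fd, buf, sizeof (buf), 13);
		if (n == -1)
			perror("pread");
		else
			printf("read %zd bytes\n", n);

		close(fd);
		return (0);
	}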

To pass this information between the layers I had to change a number
of APIs.  As a side effect, upper layers can now control not only the
caching, but also speculative prefetch.  I haven't wired that to the
VFS yet, since it requires looking into some OS specifics.  But while
there, I've implemented speculative prefetch of indirect blocks for
Direct I/O, controllable via all the same mechanisms.
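
A minimal kernel-side sketch of what the widened DMU API allows (the
wrapper function is invented for illustration; the flags and the
dmu_read_uio_dnode() signature are the ones introduced here):

	/*
	 * Sketch only: read a range through the ARC with speculative
	 * prefetch enabled, but mark the data as uncached so it is
	 * evicted as soon as reasonable -- roughly what zfs_read() now
	 * does for O_DIRECT requests that fall back to uncached I/O.
	 */
	static int
	example_uncached_read(dnode_t *dn, zfs_uio_t *uio, uint64_t nbytes)
	{
		dmu_flags_t dflags = DMU_READ_PREFETCH | DMU_UNCACHEDIO;

		return (dmu_read_uio_dnode(dn, uio, nbytes, dflags));
	}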

Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Fixes #17027
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Alexander Motin 2025-05-13 17:26:55 -04:00 committed by GitHub
parent e2ba0f7643
commit 734eba251d
35 changed files with 397 additions and 294 deletions

View File

@ -1993,7 +1993,8 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
if (write_state == WR_COPIED &&
dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH |
DMU_KEEP_CACHING) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
write_state = WR_NEED_COPY;
@ -2265,19 +2266,19 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
ASSERT(doi.doi_data_block_size);
ASSERT0(offset % doi.doi_data_block_size);
if (ztest_random(4) != 0) {
int prefetch = ztest_random(2) ?
dmu_flags_t flags = ztest_random(2) ?
DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
/*
* We will randomly set when to do O_DIRECT on a read.
*/
if (ztest_random(4) == 0)
prefetch |= DMU_DIRECTIO;
flags |= DMU_DIRECTIO;
ztest_block_tag_t rbt;
VERIFY(dmu_read(os, lr->lr_foid, offset,
sizeof (rbt), &rbt, prefetch) == 0);
sizeof (rbt), &rbt, flags) == 0);
if (rbt.bt_magic == BT_MAGIC) {
ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
offset, gen, txg, crtxg);
@ -2308,7 +2309,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_write(os, lr->lr_foid, offset, length, data, tx);
} else {
memcpy(abuf->b_data, data, length);
VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx));
VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx, 0));
}
(void) ztest_log_write(zd, tx, lr);
@ -2533,7 +2534,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
object, offset, size, ZTRL_READER);
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
ASSERT0(error);
} else {
ASSERT3P(zio, !=, NULL);
@ -2549,7 +2550,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
object, offset, size, ZTRL_READER);
error = dmu_buf_hold_noread(os, object, offset, zgd, &db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@ -2826,7 +2826,7 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
enum ztest_io_type io_type;
uint64_t blocksize;
void *data;
uint32_t dmu_read_flags = DMU_READ_NO_PREFETCH;
dmu_flags_t dmu_read_flags = DMU_READ_NO_PREFETCH;
/*
* We will randomly set when to do O_DIRECT on a read.
@ -5065,7 +5065,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
uint64_t stride = 123456789ULL;
uint64_t width = 40;
int free_percent = 5;
uint32_t dmu_read_flags = DMU_READ_PREFETCH;
dmu_flags_t dmu_read_flags = DMU_READ_PREFETCH;
/*
* We will randomly set when to do O_DIRECT on a read.
@ -5541,13 +5541,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
}
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off, bigbuf_arcbufs[j], tx));
off, bigbuf_arcbufs[j], tx, 0));
} else {
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off, bigbuf_arcbufs[2 * j], tx));
off, bigbuf_arcbufs[2 * j], tx, 0));
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off + chunksize / 2,
bigbuf_arcbufs[2 * j + 1], tx));
bigbuf_arcbufs[2 * j + 1], tx, 0));
}
if (i == 1) {
dmu_buf_rele(dbt, FTAG);

View File

@ -45,20 +45,6 @@ extern "C" {
#define IN_DMU_SYNC 2
/*
* define flags for dbuf_read
*/
#define DB_RF_MUST_SUCCEED (1 << 0)
#define DB_RF_CANFAIL (1 << 1)
#define DB_RF_HAVESTRUCT (1 << 2)
#define DB_RF_NOPREFETCH (1 << 3)
#define DB_RF_NEVERWAIT (1 << 4)
#define DB_RF_CACHED (1 << 5)
#define DB_RF_NO_DECRYPT (1 << 6)
#define DB_RF_PARTIAL_FIRST (1 << 7)
#define DB_RF_PARTIAL_MORE (1 << 8)
/*
* The simplified state transition diagram for dbufs looks like:
*
@ -389,12 +375,15 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag,
dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
uint64_t blkid, uint64_t *hash_out);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, dmu_flags_t flags);
void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
void dmu_buf_will_fill_flags(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail,
dmu_flags_t flags);
boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
dmu_flags_t flags);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
dmu_tx_t *tx);
@ -475,10 +464,10 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
#define DBUF_GET_BUFC_TYPE(_db) \
(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
#define DBUF_IS_CACHEABLE(_db) \
#define DBUF_IS_CACHEABLE(_db) (!(_db)->db_pending_evict && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))))
boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp);

View File

@ -532,6 +532,26 @@ void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
struct zio_prop *zp);
/*
* DB_RF_* are to be used for dbuf_read() or in limited other cases.
*/
typedef enum dmu_flags {
DB_RF_MUST_SUCCEED = 0, /* Suspend on I/O errors. */
DB_RF_CANFAIL = 1 << 0, /* Return on I/O errors. */
DB_RF_HAVESTRUCT = 1 << 1, /* dn_struct_rwlock is locked. */
DB_RF_NEVERWAIT = 1 << 2,
DMU_READ_PREFETCH = 0, /* Try speculative prefetch. */
DMU_READ_NO_PREFETCH = 1 << 3, /* Don't prefetch speculatively. */
DB_RF_NOPREFETCH = DMU_READ_NO_PREFETCH,
DMU_READ_NO_DECRYPT = 1 << 4, /* Don't decrypt. */
DB_RF_NO_DECRYPT = DMU_READ_NO_DECRYPT,
DMU_DIRECTIO = 1 << 5, /* Bypass ARC. */
DMU_UNCACHEDIO = 1 << 6, /* Reduce caching. */
DMU_PARTIAL_FIRST = 1 << 7, /* First partial access. */
DMU_PARTIAL_MORE = 1 << 8, /* Following partial access. */
DMU_KEEP_CACHING = 1 << 9, /* Don't affect caching. */
} dmu_flags_t;
/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
@ -547,7 +567,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag,
dmu_buf_t **dbp);
int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
uint32_t flags);
dmu_flags_t flags);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
@ -558,9 +578,9 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
* Special spill buffer support used by "SA" framework
*/
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags,
const void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags,
const void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
@ -579,17 +599,17 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
* The object number must be a valid, allocated object number.
*/
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **, int flags);
const void *tag, dmu_buf_t **, dmu_flags_t flags);
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp);
int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags);
const void *tag, dmu_buf_t **dbp, dmu_flags_t flags);
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp, uint32_t flags);
dmu_buf_t ***dbpp, dmu_flags_t flags);
int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
dmu_buf_t **dbp);
@ -781,6 +801,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
* (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
*/
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags);
boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
@ -874,40 +895,36 @@ int dmu_free_long_object(objset_t *os, uint64_t object);
* Canfail routines will return 0 on success, or an errno if there is a
* nonrecoverable I/O error.
*/
#define DMU_READ_PREFETCH 0 /* prefetch */
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */
#define DMU_DIRECTIO 4 /* use Direct I/O */
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags);
void *buf, dmu_flags_t flags);
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
uint32_t flags);
dmu_flags_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx, uint32_t flags);
const void *buf, dmu_tx_t *tx, dmu_flags_t flags);
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx);
#ifdef _KERNEL
int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size);
int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size);
int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, dmu_flags_t flags);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, dmu_flags_t flags);
int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, dmu_flags_t flags);
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset,
struct arc_buf *buf, dmu_tx_t *tx);
struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset,
struct arc_buf *buf, dmu_tx_t *tx);
struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
#define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf
extern uint_t zfs_max_recordsize;

View File

@ -270,11 +270,13 @@ void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *);
int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags);
int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *);
int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t);
int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t,
dmu_tx_t *);
#if defined(_KERNEL)
int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t);
int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *);
int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t);
int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t,
dmu_tx_t *);
#endif
#ifdef __cplusplus

View File

@ -81,9 +81,10 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *);
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
boolean_t);
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t,
boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
boolean_t, boolean_t);
#ifdef __cplusplus

View File

@ -981,9 +981,9 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
uint64_t extents_skipped, uint64_t bytes_skipped,
uint64_t extents_failed, uint64_t bytes_failed);
extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
uint32_t flags);
dmu_flags_t flags);
extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
uint32_t flags);
dmu_flags_t flags);
extern void spa_import_progress_add(spa_t *spa);
extern void spa_import_progress_remove(uint64_t spa_guid);
extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,

View File

@ -33,7 +33,9 @@
/*
* Platform-dependent resource accounting hooks
*/
void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags);
void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags);
#endif /* _SYS_ZFS_RACCT_H */

View File

@ -27,13 +27,13 @@
#include <sys/zfs_racct.h>
void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}
void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}

View File

@ -304,7 +304,7 @@ Default dnode block size as a power of 2.
.It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int
Default dnode indirect block size as a power of 2.
.
.It Sy zfs_dio_enabled Ns = Ns Sy 0 Ns | Ns 1 Pq int
.It Sy zfs_dio_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Enable Direct I/O.
If this setting is 0, then all I/O requests will be directed through the ARC
acting as though the dataset property
@ -312,6 +312,11 @@ acting as though the dataset property
was set to
.Sy disabled .
.
.It Sy zfs_dio_strict Ns = Ns Sy 0 Ns | Ns 1 Pq int
Strictly enforce alignment for Direct I/O requests, returning
.Sy EINVAL
if not page-aligned instead of silently falling back to uncached I/O.
.
.It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
When attempting to log an output nvlist of an ioctl in the on-disk history,
the output will not be stored if it is larger than this size (in bytes).

View File

@ -41,7 +41,6 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
@ -71,6 +70,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
struct sf_buf *sf;
int numbufs, i;
int err;
dmu_flags_t flags = 0;
if (size == 0)
return (0);
@ -94,10 +94,17 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
if (tocpy == db->db_size) {
dmu_buf_will_fill(db, tx, B_FALSE);
else
dmu_buf_will_dirty(db, tx);
} else {
if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
if (bufoff == 0)
flags |= DMU_PARTIAL_FIRST;
else
flags |= DMU_PARTIAL_MORE;
}
dmu_buf_will_dirty_flags(db, tx, flags);
}
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
ASSERT3U(ptoa((*ma)->pindex), ==,

View File

@ -28,7 +28,7 @@
#include <sys/racct.h>
void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
curthread->td_ru.ru_inblock += iops;
#ifdef RACCT
@ -46,7 +46,7 @@ zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
}
void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
curthread->td_ru.ru_oublock += iops;
#ifdef RACCT

View File

@ -530,7 +530,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
page_unhold(pp);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, bytes);
uio, bytes, DMU_READ_PREFETCH);
}
len -= bytes;
off = 0;

View File

@ -679,7 +679,7 @@ zvol_strategy_impl(zv_request_t *zvr)
while (resid != 0 && off < volsize) {
size_t size = MIN(resid, zvol_maxphys);
if (doread) {
error = dmu_read(os, ZVOL_OBJ, off, size, addr,
error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
DMU_READ_PREFETCH);
} else {
dmu_tx_t *tx = dmu_tx_create(os);
@ -688,7 +688,8 @@ zvol_strategy_impl(zv_request_t *zvr)
if (error) {
dmu_tx_abort(tx);
} else {
dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
dmu_write_by_dnode(zv->zv_dn, off, size, addr,
tx, DMU_READ_PREFETCH);
zvol_log_write(zv, tx, off, size, commit);
dmu_tx_commit(tx);
}
@ -834,7 +835,8 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
if (bytes > volsize - zfs_uio_offset(&uio))
bytes = volsize - zfs_uio_offset(&uio);
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
DMU_READ_PREFETCH);
if (error) {
/* Convert checksum errors into IO errors. */
if (error == ECKSUM)
@ -893,7 +895,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
dmu_tx_abort(tx);
break;
}
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
DMU_READ_PREFETCH);
if (error == 0)
zvol_log_write(zv, tx, off, bytes, commit);
dmu_tx_commit(tx);

View File

@ -30,14 +30,14 @@
#include <linux/task_io_accounting_ops.h>
void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
task_io_account_read(size);
spa_iostats_read_add(spa, size, iops, flags);
}
void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
task_io_account_write(size);
spa_iostats_write_add(spa, size, iops, flags);
@ -46,13 +46,13 @@ zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
#else
void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}
void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{
(void) spa, (void) size, (void) iops, (void) flags;
}

View File

@ -329,7 +329,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
put_page(pp);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, bytes);
uio, bytes, DMU_READ_PREFETCH);
}
len -= bytes;

View File

@ -258,7 +258,8 @@ zvol_write(zv_request_t *zvr)
dmu_tx_abort(tx);
break;
}
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
DMU_READ_PREFETCH);
if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync);
}
@ -428,7 +429,8 @@ zvol_read(zv_request_t *zvr)
if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio.uio_loffset;
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
DMU_READ_PREFETCH);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)

View File

@ -6103,7 +6103,9 @@ top:
ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
metadata, misses);
zfs_racct_read(spa, size, 1, 0);
zfs_racct_read(spa, size, 1,
(*arc_flags & ARC_FLAG_UNCACHED) ?
DMU_UNCACHEDIO : 0);
}
/* Check if the spa even has l2 configured */

View File

@ -1499,7 +1499,8 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
*/
static int
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn,
dmu_flags_t flags)
{
objset_t *os = db->db_objset;
dmu_buf_impl_t *dndb;
@ -1507,7 +1508,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
zbookmark_phys_t zb;
int err;
if ((flags & DB_RF_NO_DECRYPT) != 0 ||
if ((flags & DMU_READ_NO_DECRYPT) != 0 ||
!os->os_encrypted || os->os_raw_receive ||
(dndb = dn->dn_dbuf) == NULL)
return (0);
@ -1561,7 +1562,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
* returning.
*/
static int
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
db_lock_type_t dblt, blkptr_t *bp, const void *tag)
{
zbookmark_phys_t zb;
@ -1627,7 +1628,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
zio_flags = (flags & DB_RF_CANFAIL) ?
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;
/*
@ -1728,7 +1729,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
}
int
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags)
{
dnode_t *dn;
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
@ -1748,12 +1749,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
goto done;
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
(flags & DB_RF_NOPREFETCH) == 0;
(flags & DMU_READ_NO_PREFETCH) == 0;
mutex_enter(&db->db_mtx);
if (flags & DB_RF_PARTIAL_FIRST)
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
db->db_pending_evict = B_FALSE;
if (flags & DMU_PARTIAL_FIRST)
db->db_partial_read = B_TRUE;
else if (!(flags & DB_RF_PARTIAL_MORE))
else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING)))
db->db_partial_read = B_FALSE;
miss = (db->db_state != DB_CACHED);
@ -1794,7 +1797,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
* unauthenticated blocks, which will verify their MAC if
* the key is now available.
*/
if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL &&
(arc_is_encrypted(db->db_buf) ||
arc_is_unauthenticated(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@ -1842,7 +1845,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
if (err == 0 && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
flags & DB_RF_HAVESTRUCT);
flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) ||
db->db_pending_evict);
}
DB_DNODE_EXIT(db);
@ -1874,11 +1878,14 @@ done:
}
static void
dbuf_noread(dmu_buf_impl_t *db)
dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)
{
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
db->db_pending_evict = B_FALSE;
db->db_partial_read = B_FALSE;
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
@ -2191,8 +2198,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
kmem_free(dr, sizeof (*dr));
return (NULL);
}
int err = dbuf_read(parent_db, NULL,
(DB_RF_NOPREFETCH | DB_RF_CANFAIL));
int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL |
DMU_READ_NO_PREFETCH);
if (err != 0) {
dbuf_rele(parent_db, FTAG);
kmem_free(dr, sizeof (*dr));
@ -2620,8 +2627,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
}
static void
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
void
dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
boolean_t undirty = B_FALSE;
@ -2673,7 +2680,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
* not the uderlying block that is being replaced. dbuf_undirty() will
* do brt_pending_remove() before removing the dirty record.
*/
(void) dbuf_read(db, NULL, flags);
(void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED);
if (undirty) {
mutex_enter(&db->db_mtx);
VERIFY(!dbuf_undirty(db, tx));
@ -2685,8 +2692,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_will_dirty_impl(db_fake,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
}
boolean_t
@ -2850,7 +2856,7 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
dbuf_noread(db);
dbuf_noread(db, DMU_KEEP_CACHING);
(void) dbuf_dirty(db, tx);
}
@ -2864,12 +2870,13 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
DTRACE_SET_STATE(db, "allocating NOFILL buffer");
mutex_exit(&db->db_mtx);
dbuf_noread(db);
dbuf_noread(db, DMU_KEEP_CACHING);
(void) dbuf_dirty(db, tx);
}
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,
dmu_flags_t flags)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@ -2891,7 +2898,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
*/
if (canfail && dr) {
mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx);
dmu_buf_will_dirty_flags(db_fake, tx, flags);
return;
}
/*
@ -2907,10 +2914,16 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
}
mutex_exit(&db->db_mtx);
dbuf_noread(db);
dbuf_noread(db, flags);
(void) dbuf_dirty(db, tx);
}
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
{
dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH);
}
/*
* This function is effectively the same as dmu_buf_will_dirty(), but
* indicates the caller expects raw encrypted data in the db, and provides
@ -2933,8 +2946,8 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
ASSERT0(db->db_level);
ASSERT(db->db_objset->os_raw_receive);
dmu_buf_will_dirty_impl(db_fake,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
dmu_buf_will_dirty_flags(db_fake, tx,
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
dr = dbuf_find_dirty_eq(db, tx->tx_txg);
@ -3076,7 +3089,8 @@ dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
*/
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
dmu_flags_t flags)
{
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
@ -3090,6 +3104,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
ASSERT(arc_released(buf));
mutex_enter(&db->db_mtx);
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
db->db_pending_evict = B_FALSE;
db->db_partial_read = B_FALSE;
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
@ -3344,8 +3361,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
if (err)
return (err);
err = dbuf_read(*parentp, NULL,
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL |
DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH);
if (err) {
dbuf_rele(*parentp, NULL);
*parentp = NULL;
@ -3404,7 +3421,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_user = NULL;
db->db_user_immediate_evict = FALSE;
db->db_freed_in_flight = FALSE;
db->db_pending_evict = FALSE;
db->db_pending_evict = TRUE;
db->db_partial_read = FALSE;
if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
@ -3615,8 +3633,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dbuf_prefetch_fini(dpa, B_TRUE);
return;
}
(void) dbuf_read(db, NULL,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
(void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
DMU_READ_NO_PREFETCH);
dbuf_rele(db, FTAG);
}
@ -4002,6 +4020,7 @@ dbuf_create_bonus(dnode_t *dn)
ASSERT(dn->dn_bonus == NULL);
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
dn->dn_bonus->db_pending_evict = FALSE;
}
int
@ -4167,8 +4186,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
* This dbuf has anonymous data associated with it.
*/
dbuf_destroy(db);
} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
db->db_pending_evict) {
} else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) {
/*
* We don't expect more accesses to the dbuf, and it
* is either not cacheable or was marked for eviction.
*/
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);

View File

@ -222,20 +222,14 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
int
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags)
const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
if (err != 0) {
dbuf_rele(db, tag);
*dbp = NULL;
@ -247,20 +241,14 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags)
const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
if (err != 0) {
dbuf_rele(db, tag);
*dbp = NULL;
@ -358,16 +346,10 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
* Returns ENOENT, EIO, or 0.
*/
int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
uint32_t flags)
dmu_flags_t flags)
{
dmu_buf_impl_t *db;
int error;
uint32_t db_flags = DB_RF_MUST_SUCCEED;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus == NULL) {
@ -393,7 +375,7 @@ int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
*/
rw_exit(&dn->dn_struct_rwlock);
error = dbuf_read(db, NULL, db_flags);
error = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
if (error) {
dnode_evict_bonus(dn);
dbuf_rele(db, tag);
@ -431,7 +413,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp)
* dmu_spill_hold_existing() should be used.
*/
int
dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag,
dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags, const void *tag,
dmu_buf_t **dbp)
{
dmu_buf_impl_t *db = NULL;
@ -489,18 +471,14 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp)
}
int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags, const void *tag,
dmu_buf_t **dbp)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
int err;
uint32_t db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
DB_DNODE_ENTER(db);
err = dmu_spill_hold_by_dnode(DB_DNODE(db), db_flags, tag, dbp);
err = dmu_spill_hold_by_dnode(DB_DNODE(db), flags, tag, dbp);
DB_DNODE_EXIT(db);
return (err);
@ -515,12 +493,12 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp,
uint32_t flags)
dmu_flags_t flags)
{
dmu_buf_t **dbp;
zstream_t *zs = NULL;
uint64_t blkid, nblks, i;
uint32_t dbuf_flags;
dmu_flags_t dbuf_flags;
int err;
zio_t *zio = NULL;
boolean_t missed = B_FALSE;
@ -532,11 +510,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
* we can tell it about the multi-block read. dbuf_read() only knows
* about the one block it is accessing.
*/
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
DB_RF_NOPREFETCH;
if ((flags & DMU_READ_NO_DECRYPT) != 0)
dbuf_flags |= DB_RF_NO_DECRYPT;
dbuf_flags = (flags & ~DMU_READ_PREFETCH) | DMU_READ_NO_PREFETCH |
DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
@ -569,15 +544,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
* that if multiple threads block on same indirect block, we
* base predictions on the original less racy request order.
*/
zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read,
B_TRUE);
zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
read && !(flags & DMU_DIRECTIO), B_TRUE);
}
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) {
if (zs) {
dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
B_TRUE);
B_TRUE, (flags & DMU_UNCACHEDIO));
}
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
@ -599,9 +574,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
offset + length < db->db.db_offset +
db->db.db_size) {
if (offset <= db->db.db_offset)
dbuf_flags |= DB_RF_PARTIAL_FIRST;
dbuf_flags |= DMU_PARTIAL_FIRST;
else
dbuf_flags |= DB_RF_PARTIAL_MORE;
dbuf_flags |= DMU_PARTIAL_MORE;
}
(void) dbuf_read(db, zio, dbuf_flags);
if (db->db_state != DB_CACHED)
@ -621,8 +596,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
if (!read && ((flags & DMU_DIRECTIO) == 0))
zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
if (zs)
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
if (zs) {
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE,
(flags & DMU_UNCACHEDIO));
}
rw_exit(&dn->dn_struct_rwlock);
if (read) {
@ -1170,7 +1147,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
static int
dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
void *buf, uint32_t flags)
void *buf, dmu_flags_t flags)
{
dmu_buf_t **dbp;
int numbufs, err = 0;
@ -1198,6 +1175,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
abd_free(data);
return (err);
}
flags &= ~DMU_DIRECTIO;
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
@ -1236,7 +1214,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags)
void *buf, dmu_flags_t flags)
{
dnode_t *dn;
int err;
@ -1252,14 +1230,14 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
int
dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
uint32_t flags)
dmu_flags_t flags)
{
return (dmu_read_impl(dn, offset, size, buf, flags));
}
static void
dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
{
int i;
@ -1275,10 +1253,17 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx, B_FALSE);
else
dmu_buf_will_dirty(db, tx);
if (tocpy == db->db_size) {
dmu_buf_will_fill_flags(db, tx, B_FALSE, flags);
} else {
if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
if (bufoff == 0)
flags |= DMU_PARTIAL_FIRST;
else
flags |= DMU_PARTIAL_MORE;
}
dmu_buf_will_dirty_flags(db, tx, flags);
}
ASSERT(db->db_data != NULL);
(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
@ -1304,17 +1289,13 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
VERIFY0(dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
dmu_write_impl(dbp, numbufs, offset, size, buf, tx, DMU_READ_PREFETCH);
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
/*
* This interface is not used internally by ZFS but is provided for
* use by Lustre which is built on the DMU interfaces.
*/
int
dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx, uint32_t flags)
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
{
dmu_buf_t **dbp;
int numbufs;
@ -1327,25 +1308,19 @@ dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
abd_t *data = abd_get_from_buf((void *)buf, size);
error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
error = dmu_write_abd(dn, offset, size, data, flags, tx);
abd_free(data);
return (error);
}
flags &= ~DMU_DIRECTIO;
VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
FALSE, FTAG, &numbufs, &dbp, flags));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx, flags);
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (0);
}
int
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
{
return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0));
}
void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx)
@ -1402,20 +1377,22 @@ dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
#ifdef _KERNEL
int
dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags)
{
dmu_buf_t **dbp;
int numbufs, i, err;
if (uio->uio_extflg & UIO_DIRECT)
return (dmu_read_uio_direct(dn, uio, size));
return (dmu_read_uio_direct(dn, uio, size, flags));
flags &= ~DMU_DIRECTIO;
/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
TRUE, FTAG, &numbufs, &dbp, 0);
TRUE, FTAG, &numbufs, &dbp, flags);
if (err)
return (err);
@ -1453,7 +1430,8 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
* because we don't have to find the dnode_t for the object.
*/
int
dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
int err;
@ -1462,7 +1440,7 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
return (0);
DB_DNODE_ENTER(db);
err = dmu_read_uio_dnode(DB_DNODE(db), uio, size);
err = dmu_read_uio_dnode(DB_DNODE(db), uio, size, flags);
DB_DNODE_EXIT(db);
return (err);
@ -1474,7 +1452,8 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
* Starting at offset zfs_uio_offset(uio).
*/
int
dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags)
{
dnode_t *dn;
int err;
@ -1486,7 +1465,7 @@ dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
if (err)
return (err);
err = dmu_read_uio_dnode(dn, uio, size);
err = dmu_read_uio_dnode(dn, uio, size, flags);
dnode_rele(dn, FTAG);
@ -1494,12 +1473,14 @@ dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
}
int
dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx,
dmu_flags_t flags)
{
dmu_buf_t **dbp;
int numbufs;
int err = 0;
uint64_t write_size;
dmu_flags_t oflags = flags;
top:
write_size = size;
@ -1512,13 +1493,14 @@ top:
(write_size >= dn->dn_datablksz)) {
if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
dn->dn_datablksz)) {
return (dmu_write_uio_direct(dn, uio, size, tx));
return (dmu_write_uio_direct(dn, uio, size, flags, tx));
} else if (write_size > dn->dn_datablksz &&
zfs_dio_offset_aligned(zfs_uio_offset(uio),
dn->dn_datablksz)) {
write_size =
dn->dn_datablksz * (write_size / dn->dn_datablksz);
err = dmu_write_uio_direct(dn, uio, write_size, tx);
err = dmu_write_uio_direct(dn, uio, write_size, flags,
tx);
if (err == 0) {
size -= write_size;
goto top;
@ -1530,9 +1512,10 @@ top:
P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
}
}
flags &= ~DMU_DIRECTIO;
err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
FALSE, FTAG, &numbufs, &dbp, flags);
if (err)
return (err);
@ -1549,10 +1532,17 @@ top:
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx, B_TRUE);
else
dmu_buf_will_dirty(db, tx);
if (tocpy == db->db_size) {
dmu_buf_will_fill_flags(db, tx, B_TRUE, flags);
} else {
if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
if (bufoff == 0)
flags |= DMU_PARTIAL_FIRST;
else
flags |= DMU_PARTIAL_MORE;
}
dmu_buf_will_dirty_flags(db, tx, flags);
}
ASSERT(db->db_data != NULL);
err = zfs_uio_fault_move((char *)db->db_data + bufoff,
@ -1575,6 +1565,7 @@ top:
dmu_buf_rele_array(dbp, numbufs, FTAG);
if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
flags = oflags;
goto top;
}
@ -1592,7 +1583,7 @@ top:
*/
int
dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx)
dmu_tx_t *tx, dmu_flags_t flags)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
int err;
@ -1601,7 +1592,7 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
return (0);
DB_DNODE_ENTER(db);
err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx);
err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx, flags);
DB_DNODE_EXIT(db);
return (err);
@ -1614,7 +1605,7 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
*/
int
dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx)
dmu_tx_t *tx, dmu_flags_t flags)
{
dnode_t *dn;
int err;
@ -1626,7 +1617,7 @@ dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
if (err)
return (err);
err = dmu_write_uio_dnode(dn, uio, size, tx);
err = dmu_write_uio_dnode(dn, uio, size, tx, flags);
dnode_rele(dn, FTAG);
@ -1796,11 +1787,10 @@ dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
*/
int
dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
dmu_tx_t *tx, dmu_flags_t flags)
{
dmu_buf_impl_t *db;
objset_t *os = dn->dn_objset;
uint64_t object = dn->dn_object;
uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
uint64_t blkid;
@ -1816,8 +1806,8 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
* same size as the dbuf.
*/
if (offset == db->db.db_offset && blksz == db->db.db_size) {
zfs_racct_write(os->os_spa, blksz, 1, 0);
dbuf_assign_arcbuf(db, buf, tx);
zfs_racct_write(os->os_spa, blksz, 1, flags);
dbuf_assign_arcbuf(db, buf, tx, flags);
dbuf_rele(db, FTAG);
} else {
/* compressed bufs must always be assignable to their dbuf */
@ -1825,7 +1815,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
dbuf_rele(db, FTAG);
dmu_write(os, object, offset, blksz, buf->b_data, tx);
dmu_write_by_dnode(dn, offset, blksz, buf->b_data, tx, flags);
dmu_return_arcbuf(buf);
}
@ -1834,13 +1824,13 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
int
dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
dmu_tx_t *tx, dmu_flags_t flags)
{
int err;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
DB_DNODE_ENTER(db);
err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx);
err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx, flags);
DB_DNODE_EXIT(db);
return (err);
@ -1985,7 +1975,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
int error;
error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
DB_RF_CANFAIL | DB_RF_NOPREFETCH);
DB_RF_CANFAIL | DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
if (error != 0)
return (error);
@ -2928,7 +2918,6 @@ EXPORT_SYMBOL(dmu_read_uio_dbuf);
EXPORT_SYMBOL(dmu_read_uio_dnode);
EXPORT_SYMBOL(dmu_write);
EXPORT_SYMBOL(dmu_write_by_dnode);
EXPORT_SYMBOL(dmu_write_by_dnode_flags);
EXPORT_SYMBOL(dmu_write_uio);
EXPORT_SYMBOL(dmu_write_uio_dbuf);
EXPORT_SYMBOL(dmu_write_uio_dnode);

View File

@ -208,7 +208,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
int
dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
abd_t *data, uint32_t flags, dmu_tx_t *tx)
abd_t *data, dmu_flags_t flags, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
spa_t *spa = dn->dn_objset->os_spa;
@ -247,7 +247,7 @@ dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
int
dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
abd_t *data, uint32_t flags)
abd_t *data, dmu_flags_t flags)
{
objset_t *os = dn->dn_objset;
spa_t *spa = os->os_spa;
@ -351,7 +351,8 @@ error:
#ifdef _KERNEL
int
dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags)
{
offset_t offset = zfs_uio_offset(uio);
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
@ -362,7 +363,7 @@ dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
offset & (PAGESIZE - 1), size);
err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
err = dmu_read_abd(dn, offset, size, data, flags);
abd_free(data);
if (err == 0)
@ -372,7 +373,8 @@ dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
}
int
dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags, dmu_tx_t *tx)
{
offset_t offset = zfs_uio_offset(uio);
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
@ -383,7 +385,7 @@ dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
offset & (PAGESIZE - 1), size);
err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
err = dmu_write_abd(dn, offset, size, data, flags, tx);
abd_free(data);
if (err == 0)

View File

@ -2332,12 +2332,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
data = DN_BONUS(dn->dn_phys);
}
} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
int rf = 0;
dmu_flags_t rf = DB_RF_MUST_SUCCEED;
if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
error = dmu_spill_hold_by_dnode(dn,
rf | DB_RF_MUST_SUCCEED,
error = dmu_spill_hold_by_dnode(dn, rf,
FTAG, (dmu_buf_t **)&db);
ASSERT(error == 0);
mutex_enter(&db->db_mtx);

View File

@ -2135,7 +2135,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
if (data != NULL) {
dmu_buf_t *db;
dnode_t *dn;
uint32_t flags = DMU_READ_NO_PREFETCH;
dmu_flags_t flags = DMU_READ_NO_PREFETCH;
if (rwa->raw)
flags |= DMU_READ_NO_DECRYPT;
@ -2277,14 +2277,18 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
dmu_write_by_dnode(dn,
drrw->drr_offset,
drrw->drr_logical_size,
abd_to_buf(decomp_abd), tx);
abd_to_buf(decomp_abd), tx,
DMU_READ_NO_PREFETCH |
DMU_UNCACHEDIO);
}
abd_free(decomp_abd);
} else {
dmu_write_by_dnode(dn,
drrw->drr_offset,
drrw->drr_logical_size,
abd_to_buf(abd), tx);
abd_to_buf(abd), tx,
DMU_READ_NO_PREFETCH |
DMU_UNCACHEDIO);
}
if (err == 0)
abd_free(abd);
@ -2407,10 +2411,10 @@ receive_process_write_record(struct receive_writer_arg *rwa,
if (rwa->heal) {
blkptr_t *bp;
dmu_buf_t *dbp;
int flags = DB_RF_CANFAIL;
dmu_flags_t flags = DB_RF_CANFAIL;
if (rwa->raw)
flags |= DB_RF_NO_DECRYPT;
flags |= DMU_READ_NO_DECRYPT;
if (rwa->byteswap) {
dmu_object_byteswap_t byteswap =
@ -2567,8 +2571,8 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
rwa->max_object = drrs->drr_object;
VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG,
&db_spill)) != 0) {
if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT |
DB_RF_CANFAIL, FTAG, &db_spill)) != 0) {
dmu_buf_rele(db, FTAG);
return (err);
}
@ -2621,7 +2625,8 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs));
abd_free(abd);
dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx,
DMU_UNCACHEDIO);
dmu_buf_rele(db, FTAG);
dmu_buf_rele(db_spill, FTAG);

View File

@ -297,7 +297,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
}
if (BP_GET_LEVEL(bp) > 0) {
uint32_t flags = ARC_FLAG_WAIT;
arc_flags_t flags = ARC_FLAG_WAIT;
int32_t i, ptidx, pidx;
uint32_t prefetchlimit;
int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
@ -364,8 +364,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
kmem_free(czb, sizeof (zbookmark_phys_t));
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_FLAG_WAIT;
uint32_t zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t flags = ARC_FLAG_WAIT;
zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
dnode_phys_t *child_dnp;
@ -397,7 +397,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t zio_flags = ZIO_FLAG_CANFAIL;
zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
@ -669,7 +669,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
/* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
uint32_t flags = ARC_FLAG_WAIT;
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
arc_buf_t *buf;
ASSERT(!BP_IS_REDACTED(rootbp));

View File

@ -222,8 +222,8 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
* PARTIAL_FIRST allows caching for uncacheable blocks. It will
* be cleared after dmu_buf_will_dirty() call dbuf_read() again.
*/
err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH |
(level == 0 ? DB_RF_PARTIAL_FIRST : 0));
err = dbuf_read(db, zio, DB_RF_CANFAIL | DMU_READ_NO_PREFETCH |
(level == 0 ? (DMU_UNCACHEDIO | DMU_PARTIAL_FIRST) : 0));
dbuf_rele(db, FTAG);
return (err);
}

View File

@ -690,7 +690,7 @@ prescient:
void
dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
boolean_t have_lock)
boolean_t have_lock, boolean_t uncached)
{
int64_t pf_start, pf_end, ipf_start, ipf_end;
int epbs, issued;
@ -745,7 +745,8 @@ dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
issued = 0;
for (int64_t blk = pf_start; blk < pf_end; blk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
ZIO_PRIORITY_ASYNC_READ, uncached ?
ARC_FLAG_UNCACHED : 0, dmu_zfetch_done, zs);
}
for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
@ -761,13 +762,13 @@ dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
boolean_t missed, boolean_t have_lock)
boolean_t missed, boolean_t have_lock, boolean_t uncached)
{
zstream_t *zs;
zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
if (zs)
dmu_zfetch_run(zf, zs, missed, have_lock);
dmu_zfetch_run(zf, zs, missed, have_lock, uncached);
}
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,

View File

@ -1510,7 +1510,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
* if we get the encrypted or decrypted version.
*/
err = dbuf_read(db, NULL, DB_RF_CANFAIL |
DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
if (err) {
DNODE_STAT_BUMP(dnode_hold_dbuf_read);
dbuf_rele(db, FTAG);
@ -2578,7 +2578,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
}
error = dbuf_read(db, NULL,
DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
if (error) {
dbuf_rele(db, FTAG);
return (error);

View File

@ -513,6 +513,7 @@ dnode_evict_dbufs(dnode_t *dn)
avl_remove(&dn->dn_dbufs, db_marker);
} else {
db->db_pending_evict = TRUE;
db->db_partial_read = FALSE;
mutex_exit(&db->db_mtx);
db_next = AVL_NEXT(&dn->dn_dbufs, db);
}

View File

@ -703,8 +703,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
boolean_t dummy;
if (hdl->sa_spill == NULL) {
VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL,
&hdl->sa_spill) == 0);
VERIFY0(dmu_spill_hold_by_bonus(hdl->sa_bonus,
DB_RF_MUST_SUCCEED, NULL, &hdl->sa_spill));
}
dmu_buf_will_dirty(hdl->sa_spill, tx);

View File

@ -948,7 +948,8 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
}
void
spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags)
{
spa_history_kstat_t *shk = &spa->spa_stats.iostats;
kstat_t *ksp = shk->kstat;
@ -967,7 +968,8 @@ spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
}
void
spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags)
{
spa_history_kstat_t *shk = &spa->spa_stats.iostats;
kstat_t *ksp = shk->kstat;

View File

@ -669,7 +669,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
int err;
DB_DNODE_ENTER(db);
err = dmu_read_by_dnode(DB_DNODE(db), off, len,
&lr->lr_data[0], DMU_READ_NO_PREFETCH);
&lr->lr_data[0], DMU_READ_NO_PREFETCH |
DMU_KEEP_CACHING);
DB_DNODE_EXIT(db);
if (err != 0) {
zil_itx_destroy(itx);

View File

@ -89,6 +89,12 @@ static int zfs_dio_enabled = 0;
static int zfs_dio_enabled = 1;
#endif
/*
* Strictly enforce alignment for Direct I/O requests, returning EINVAL
* if not page-aligned instead of silently falling back to uncached I/O.
*/
static int zfs_dio_strict = 0;
/*
* Maximum bytes to read per chunk in zfs_read().
@ -243,46 +249,54 @@ zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
int ioflag = *ioflagp;
int error = 0;
if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
zn_has_cached_data(zp, zfs_uio_offset(uio),
if (os->os_direct == ZFS_DIRECT_ALWAYS) {
/* Force either direct or uncached I/O. */
ioflag |= O_DIRECT;
}
if ((ioflag & O_DIRECT) == 0)
goto out;
if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED) {
/*
* Direct I/O is disabled. The I/O request will be directed
* through the ARC as uncached I/O.
*/
goto out;
}
if (!zfs_uio_page_aligned(uio) ||
!zfs_uio_aligned(uio, PAGE_SIZE)) {
/*
* Misaligned requests can be executed through the ARC as
* uncached I/O.  But if the user explicitly requested O_DIRECT
* and strict mode is enabled, fail with EINVAL instead.
*/
if ((*ioflagp & O_DIRECT) && zfs_dio_strict)
error = SET_ERROR(EINVAL);
goto out;
}
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
/*
* Direct I/O is disabled or the region is mmap'ed. In either
* case the I/O request will just directed through the ARC.
* The region is mmap'ed. The I/O request will be directed
* through the ARC as uncached I/O.
*/
ioflag &= ~O_DIRECT;
goto out;
} else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
zfs_uio_page_aligned(uio) &&
zfs_uio_aligned(uio, PAGE_SIZE)) {
if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
(rw == UIO_READ)) {
ioflag |= O_DIRECT;
}
} else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
/*
* Direct I/O was requested through the direct=always, but it
* is not properly PAGE_SIZE aligned. The request will be
* directed through the ARC.
*/
ioflag &= ~O_DIRECT;
}
if (ioflag & O_DIRECT) {
if (!zfs_uio_page_aligned(uio) ||
!zfs_uio_aligned(uio, PAGE_SIZE)) {
error = SET_ERROR(EINVAL);
goto out;
}
/*
* For short writes the page mapping of Direct I/O makes no sense.
* Route them through the ARC as uncached I/O instead.
*/
if (rw == UIO_WRITE && zfs_uio_resid(uio) < zp->z_blksz)
goto out;
error = zfs_uio_get_dio_pages_alloc(uio, rw);
if (error) {
goto out;
}
}
IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
ASSERT0(error);
error = zfs_uio_get_dio_pages_alloc(uio, rw);
if (error)
goto out;
ASSERT(uio->uio_extflg & UIO_DIRECT);
out:
*ioflagp = ioflag;
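
The rewritten zfs_setup_direct() now resolves every request to one of four outcomes: regular cached I/O, uncached I/O through the ARC, true Direct I/O with the user pages mapped, or EINVAL when strict mode rejects a misaligned explicit O_DIRECT. A standalone model of that decision order (an illustrative summary, not ZFS code; the parameter names are made up):

enum dio_outcome { DIO_CACHED, DIO_UNCACHED, DIO_DIRECT, DIO_EINVAL };

/*
 * Mirror of the outcome selection performed by zfs_setup_direct()
 * above.  short_write models a UIO_WRITE with resid < z_blksz.
 */
static enum dio_outcome
classify_request(int prop_always, int prop_disabled, int user_o_direct,
    int dio_enabled, int dio_strict, int page_aligned, int mmapped,
    int short_write)
{
        int o_direct = user_o_direct || prop_always;

        if (!o_direct)
                return (DIO_CACHED);            /* regular cached I/O */
        if (!dio_enabled || prop_disabled)
                return (DIO_UNCACHED);          /* ARC, evicted early */
        if (!page_aligned)
                return ((user_o_direct && dio_strict) ?
                    DIO_EINVAL : DIO_UNCACHED);
        if (mmapped || short_write)
                return (DIO_UNCACHED);
        return (DIO_DIRECT);                    /* map user pages */
}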
@ -392,6 +406,9 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
ssize_t start_resid = n;
ssize_t dio_remaining_resid = 0;
dmu_flags_t dflags = DMU_READ_PREFETCH;
if (ioflag & O_DIRECT)
dflags |= DMU_UNCACHEDIO;
if (uio->uio_extflg & UIO_DIRECT) {
/*
* All pages for an O_DIRECT request have already been mapped
@ -414,6 +431,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
if (dio_remaining_resid != 0)
n -= dio_remaining_resid;
dflags |= DMU_DIRECTIO;
}
while (n > 0) {
@ -429,7 +447,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
error = mappedread(zp, nbytes, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes);
uio, nbytes, dflags);
}
if (error) {
@ -479,15 +497,17 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* remainder of the file can be read using the ARC.
*/
uio->uio_extflg &= ~UIO_DIRECT;
dflags &= ~DMU_DIRECTIO;
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
error = mappedread(zp, dio_remaining_resid, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
dio_remaining_resid);
dio_remaining_resid, dflags);
}
uio->uio_extflg |= UIO_DIRECT;
dflags |= DMU_DIRECTIO;
if (error != 0)
n += dio_remaining_resid;
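
When a Direct I/O read is not a whole number of pages, zfs_read() splits it: the page-aligned prefix is read with DMU_DIRECTIO, and the tail is read through the ARC with UIO_DIRECT/DMU_DIRECTIO temporarily cleared (it stays DMU_UNCACHEDIO because O_DIRECT is still set). A worked example of the split, assuming PAGE_SIZE is 4096; the numbers are illustrative:

        ssize_t n = 10000;              /* user request */
        ssize_t dio_remaining_resid =
            n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
        /* dio_remaining_resid == 1808: read via the ARC after the loop */
        n -= dio_remaining_resid;       /* n == 8192: true Direct I/O */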
@ -859,12 +879,18 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
zfs_rangelock_reduce(lr, woff, n);
}
dmu_flags_t dflags = DMU_READ_PREFETCH;
if (ioflag & O_DIRECT)
dflags |= DMU_UNCACHEDIO;
if (uio->uio_extflg & UIO_DIRECT)
dflags |= DMU_DIRECTIO;
ssize_t tx_bytes;
if (abuf == NULL) {
tx_bytes = zfs_uio_resid(uio);
zfs_uio_fault_disable(uio, B_TRUE);
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes, tx);
uio, nbytes, tx, dflags);
zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__
if (error == EFAULT) {
@ -903,7 +929,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* arc buffer to a dbuf.
*/
error = dmu_assign_arcbuf_by_dbuf(
sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
sa_get_db(zp->z_sa_hdl), woff, abuf, tx, dflags);
if (error != 0) {
/*
* XXX This might not be necessary if
@ -1329,7 +1355,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
error = SET_ERROR(ENOENT);
} else {
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
@ -2019,3 +2045,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
"Enable Direct I/O");
ZFS_MODULE_PARAM(zfs, zfs_, dio_strict, INT, ZMOD_RW,
"Return errors on misaligned Direct I/O");

View File

@ -900,8 +900,9 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
(wr_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
if (wr_state == WR_COPIED &&
dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1,
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
@ -994,7 +995,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
size, RL_READER);
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
} else { /* indirect write */
ASSERT3P(zio, !=, NULL);
/*

View File

@ -107,6 +107,7 @@ VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
BCLONE_ENABLED bclone_enabled zfs_bclone_enabled
BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty
DIO_ENABLED dio_enabled zfs_dio_enabled
DIO_STRICT dio_strict zfs_dio_strict
XATTR_COMPAT xattr_compat zfs_xattr_compat
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max

View File

@ -40,8 +40,10 @@
verify_runnable "global"
log_must save_tunable DIO_STRICT
function cleanup
{
restore_tunable DIO_STRICT
zfs set recordsize=$rs $TESTPOOL/$TESTFS
zfs set direct=standard $TESTPOOL/$TESTFS
log_must rm -f $tmp_file
@ -61,6 +63,13 @@ file_size=$((rs * 8))
log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1
log_must set_tunable32 DIO_STRICT 0
log_must zfs set direct=standard $TESTPOOL/$TESTFS
# sub-pagesize direct writes/reads will always pass when strict mode is off.
log_must stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D
log_must stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 -d
log_must set_tunable32 DIO_STRICT 1
log_must zfs set direct=standard $TESTPOOL/$TESTFS
# sub-pagesize direct writes/reads will always fail when strict mode is on.
log_mustnot stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D

View File

@ -48,6 +48,7 @@ TESTDS=${TESTPOOL}/${TESTFS}
TESTFILE=${TESTDIR}/${TESTFILE0}
log_must save_tunable DIO_ENABLED
log_must save_tunable DIO_STRICT
typeset recordsize_saved=$(get_prop recordsize $TESTDS)
typeset direct_saved=$(get_prop direct $TESTDS)
@ -57,6 +58,7 @@ function cleanup
zfs set recordsize=$recordsize_saved $TESTDS
zfs set direct=$direct_saved $TESTDS
restore_tunable DIO_ENABLED
restore_tunable DIO_STRICT
}
log_onexit cleanup
@ -154,6 +156,7 @@ for krs in 4 8 16 32 64 128 256 512 ; do
done
# reset for write tests
log_must set_tunable32 DIO_STRICT 1
log_must zfs set recordsize=16K $TESTDS
log_must zfs set direct=standard $TESTDS
@ -173,4 +176,12 @@ log_must zpool sync
assert_dioalign $TESTFILE $PAGE_SIZE 16384
log_mustnot dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct
# same again, but without strict, which should succeed.
log_must set_tunable32 DIO_STRICT 0
log_must rm -f $TESTFILE
log_must touch $TESTFILE
log_must zpool sync
assert_dioalign $TESTFILE $PAGE_SIZE 16384
log_must dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct
log_pass $CLAIM