Wire O_DIRECT also to Uncached I/O (#17218)

Before Direct I/O was implemented, I had implemented a lighter version
I called Uncached I/O.  It uses the normal DMU/ARC data path with some
optimizations, but evicts data from the caches as soon as possible and
reasonable.  Originally I wired it only to the primarycache property;
this change completes the integration all the way up to the VFS.
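
For illustration, the early-eviction behavior can be thought of as a
per-buffer decision made when a dbuf is released: if nothing marked the
buffer as worth keeping (or uncached I/O marked it for eviction), it is
dropped instead of being parked in the dbuf cache.  The sketch below is
a minimal standalone model of that decision, not the actual dbuf code;
the struct and function names are invented for the example and only
mirror the db_pending_evict / cacheability logic touched by this change.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the per-dbuf state consulted on release. */
struct demo_dbuf {
        bool cacheable;         /* primarycache allows caching this data */
        bool partial_read;      /* partial access hints at more to come */
        bool pending_evict;     /* uncached I/O asked for early eviction */
};

/* Release-time decision: keep the buffer cached or evict it right away. */
static void
demo_dbuf_release(const struct demo_dbuf *db)
{
        if (!db->partial_read && (db->pending_evict || !db->cacheable))
                printf("evict immediately\n");  /* uncached I/O behavior */
        else
                printf("keep in dbuf cache\n"); /* regular cached I/O */
}

int
main(void)
{
        struct demo_dbuf uncached = { true, false, true };
        struct demo_dbuf cached = { true, false, false };

        demo_dbuf_release(&uncached);
        demo_dbuf_release(&cached);
        return (0);
}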

While Direct I/O has the lowest possible memory bandwidth usage,
it also has a significant number of limitations.  It requires I/Os
to be page-aligned, does not allow speculative prefetch, etc.
Uncached I/O does not have those limitations, but instead requires an
additional memory copy, though still one less than regular cached
I/O.  As such it should fill the gap between the two.  Considering
this, I've disabled the annoying EINVAL errors on misaligned requests,
falling back to Uncached I/O instead, and added a tunable
(zfs_dio_strict) for those who want to test their applications with
strict alignment enforcement.
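
To make the gap-filling concrete: a misaligned O_DIRECT request that
previously failed with EINVAL can now be served through the cached path
with early eviction, unless the strict tunable is set.  The following
standalone sketch models that decision; zfs_dio_strict and the fallback
behavior come from this change, while the helper names, the stub
alignment check, and the page size constant are illustrative only.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGESIZE   4096            /* stand-in for the platform page size */

static bool demo_strict = false;        /* models the zfs_dio_strict tunable */

/* Stub check standing in for zfs_dio_page_aligned()/zfs_dio_aligned(). */
static bool
demo_page_aligned(uint64_t off, uint64_t size)
{
        return ((off % DEMO_PAGESIZE) == 0 && (size % DEMO_PAGESIZE) == 0);
}

/*
 * Decide how an O_DIRECT request is served: true Direct I/O when aligned,
 * EINVAL when misaligned and strict, otherwise fall back to Uncached I/O.
 */
static int
demo_classify(uint64_t off, uint64_t size, const char **how)
{
        if (demo_page_aligned(off, size)) {
                *how = "Direct I/O";
                return (0);
        }
        if (demo_strict)
                return (EINVAL);
        *how = "Uncached I/O fallback";
        return (0);
}

int
main(void)
{
        const char *how;

        if (demo_classify(4096, 512, &how) == 0)
                printf("misaligned request served via %s\n", how);
        return (0);
}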

To pass this information between the layers I had to change a number
of APIs.  As a side effect, upper layers can now control not only
caching but also speculative prefetch.  I haven't wired that to the
VFS yet, since it requires looking at some OS specifics.  But while
there I've implemented speculative prefetch of indirect blocks for
Direct I/O, controllable via all the same mechanisms.
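
The new control point is the dmu_flags_t enum added to
include/sys/dmu.h: callers combine read flags such as
DMU_READ_PREFETCH / DMU_READ_NO_PREFETCH with DMU_UNCACHEDIO,
DMU_DIRECTIO or DMU_KEEP_CACHING and pass them down through
dmu_read_uio_dnode() and friends.  The standalone sketch below mirrors
only a subset of that enum to show how the bits compose; the decode
function and its output are illustrative, not part of the patch.

#include <stdio.h>

/* Subset of the dmu_flags_t bits added by this change (values as in dmu.h). */
typedef enum demo_dmu_flags {
        DEMO_READ_PREFETCH      = 0,            /* try speculative prefetch */
        DEMO_READ_NO_PREFETCH   = 1 << 3,       /* don't prefetch speculatively */
        DEMO_DIRECTIO           = 1 << 5,       /* bypass the ARC */
        DEMO_UNCACHEDIO         = 1 << 6,       /* cache, but evict early */
        DEMO_KEEP_CACHING       = 1 << 9,       /* don't affect caching state */
} demo_dmu_flags_t;

/* Show which behaviors a flag combination selects on the read path. */
static void
demo_decode(demo_dmu_flags_t flags)
{
        printf("prefetch=%s arc=%s\n",
            (flags & DEMO_READ_NO_PREFETCH) ? "no" : "yes",
            (flags & DEMO_DIRECTIO) ? "bypassed" :
            (flags & DEMO_UNCACHEDIO) ? "evict early" : "normal");
}

int
main(void)
{
        demo_decode(DEMO_READ_PREFETCH | DEMO_UNCACHEDIO);     /* uncached read */
        demo_decode(DEMO_READ_NO_PREFETCH | DEMO_DIRECTIO);    /* direct read */
        return (0);
}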

Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Fixes #17027
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Alexander Motin 2025-05-13 17:26:55 -04:00 committed by GitHub
parent e2ba0f7643
commit 734eba251d
35 changed files with 397 additions and 294 deletions


@ -1993,7 +1993,8 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
if (write_state == WR_COPIED && if (write_state == WR_COPIED &&
dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH |
DMU_KEEP_CACHING) != 0) {
zil_itx_destroy(itx); zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr)); itx = zil_itx_create(TX_WRITE, sizeof (*lr));
write_state = WR_NEED_COPY; write_state = WR_NEED_COPY;
@ -2265,19 +2266,19 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
ASSERT(doi.doi_data_block_size); ASSERT(doi.doi_data_block_size);
ASSERT0(offset % doi.doi_data_block_size); ASSERT0(offset % doi.doi_data_block_size);
if (ztest_random(4) != 0) { if (ztest_random(4) != 0) {
int prefetch = ztest_random(2) ? dmu_flags_t flags = ztest_random(2) ?
DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
/* /*
* We will randomly set when to do O_DIRECT on a read. * We will randomly set when to do O_DIRECT on a read.
*/ */
if (ztest_random(4) == 0) if (ztest_random(4) == 0)
prefetch |= DMU_DIRECTIO; flags |= DMU_DIRECTIO;
ztest_block_tag_t rbt; ztest_block_tag_t rbt;
VERIFY(dmu_read(os, lr->lr_foid, offset, VERIFY(dmu_read(os, lr->lr_foid, offset,
sizeof (rbt), &rbt, prefetch) == 0); sizeof (rbt), &rbt, flags) == 0);
if (rbt.bt_magic == BT_MAGIC) { if (rbt.bt_magic == BT_MAGIC) {
ztest_bt_verify(&rbt, os, lr->lr_foid, 0, ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
offset, gen, txg, crtxg); offset, gen, txg, crtxg);
@ -2308,7 +2309,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_write(os, lr->lr_foid, offset, length, data, tx); dmu_write(os, lr->lr_foid, offset, length, data, tx);
} else { } else {
memcpy(abuf->b_data, data, length); memcpy(abuf->b_data, data, length);
VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx)); VERIFY0(dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx, 0));
} }
(void) ztest_log_write(zd, tx, lr); (void) ztest_log_write(zd, tx, lr);
@ -2533,7 +2534,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
object, offset, size, ZTRL_READER); object, offset, size, ZTRL_READER);
error = dmu_read(os, object, offset, size, buf, error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH); DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
ASSERT0(error); ASSERT0(error);
} else { } else {
ASSERT3P(zio, !=, NULL); ASSERT3P(zio, !=, NULL);
@ -2549,7 +2550,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
object, offset, size, ZTRL_READER); object, offset, size, ZTRL_READER);
error = dmu_buf_hold_noread(os, object, offset, zgd, &db); error = dmu_buf_hold_noread(os, object, offset, zgd, &db);
if (error == 0) { if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr; blkptr_t *bp = &lr->lr_blkptr;
@ -2826,7 +2826,7 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
enum ztest_io_type io_type; enum ztest_io_type io_type;
uint64_t blocksize; uint64_t blocksize;
void *data; void *data;
uint32_t dmu_read_flags = DMU_READ_NO_PREFETCH; dmu_flags_t dmu_read_flags = DMU_READ_NO_PREFETCH;
/* /*
* We will randomly set when to do O_DIRECT on a read. * We will randomly set when to do O_DIRECT on a read.
@ -5065,7 +5065,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
uint64_t stride = 123456789ULL; uint64_t stride = 123456789ULL;
uint64_t width = 40; uint64_t width = 40;
int free_percent = 5; int free_percent = 5;
uint32_t dmu_read_flags = DMU_READ_PREFETCH; dmu_flags_t dmu_read_flags = DMU_READ_PREFETCH;
/* /*
* We will randomly set when to do O_DIRECT on a read. * We will randomly set when to do O_DIRECT on a read.
@ -5541,13 +5541,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
} }
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off, bigbuf_arcbufs[j], tx)); off, bigbuf_arcbufs[j], tx, 0));
} else { } else {
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off, bigbuf_arcbufs[2 * j], tx)); off, bigbuf_arcbufs[2 * j], tx, 0));
VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
off + chunksize / 2, off + chunksize / 2,
bigbuf_arcbufs[2 * j + 1], tx)); bigbuf_arcbufs[2 * j + 1], tx, 0));
} }
if (i == 1) { if (i == 1) {
dmu_buf_rele(dbt, FTAG); dmu_buf_rele(dbt, FTAG);


@ -45,20 +45,6 @@ extern "C" {
#define IN_DMU_SYNC 2 #define IN_DMU_SYNC 2
/*
* define flags for dbuf_read
*/
#define DB_RF_MUST_SUCCEED (1 << 0)
#define DB_RF_CANFAIL (1 << 1)
#define DB_RF_HAVESTRUCT (1 << 2)
#define DB_RF_NOPREFETCH (1 << 3)
#define DB_RF_NEVERWAIT (1 << 4)
#define DB_RF_CACHED (1 << 5)
#define DB_RF_NO_DECRYPT (1 << 6)
#define DB_RF_PARTIAL_FIRST (1 << 7)
#define DB_RF_PARTIAL_MORE (1 << 8)
/* /*
* The simplified state transition diagram for dbufs looks like: * The simplified state transition diagram for dbufs looks like:
* *
@ -389,12 +375,15 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag,
dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
uint64_t blkid, uint64_t *hash_out); uint64_t blkid, uint64_t *hash_out);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, dmu_flags_t flags);
void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
void dmu_buf_will_fill_flags(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail,
dmu_flags_t flags);
boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed); boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
dmu_flags_t flags);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
dmu_tx_t *tx); dmu_tx_t *tx);
@ -475,10 +464,10 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
#define DBUF_GET_BUFC_TYPE(_db) \ #define DBUF_GET_BUFC_TYPE(_db) \
(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
#define DBUF_IS_CACHEABLE(_db) \ #define DBUF_IS_CACHEABLE(_db) (!(_db)->db_pending_evict && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \
(dbuf_is_metadata(_db) && \ (dbuf_is_metadata(_db) && \
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))))
boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp); boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp);


@ -532,6 +532,26 @@ void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
struct zio_prop *zp); struct zio_prop *zp);
/*
* DB_RF_* are to be used for dbuf_read() or in limited other cases.
*/
typedef enum dmu_flags {
DB_RF_MUST_SUCCEED = 0, /* Suspend on I/O errors. */
DB_RF_CANFAIL = 1 << 0, /* Return on I/O errors. */
DB_RF_HAVESTRUCT = 1 << 1, /* dn_struct_rwlock is locked. */
DB_RF_NEVERWAIT = 1 << 2,
DMU_READ_PREFETCH = 0, /* Try speculative prefetch. */
DMU_READ_NO_PREFETCH = 1 << 3, /* Don't prefetch speculatively. */
DB_RF_NOPREFETCH = DMU_READ_NO_PREFETCH,
DMU_READ_NO_DECRYPT = 1 << 4, /* Don't decrypt. */
DB_RF_NO_DECRYPT = DMU_READ_NO_DECRYPT,
DMU_DIRECTIO = 1 << 5, /* Bypass ARC. */
DMU_UNCACHEDIO = 1 << 6, /* Reduce caching. */
DMU_PARTIAL_FIRST = 1 << 7, /* First partial access. */
DMU_PARTIAL_MORE = 1 << 8, /* Following partial access. */
DMU_KEEP_CACHING = 1 << 9, /* Don't affect caching. */
} dmu_flags_t;
/* /*
* The bonus data is accessed more or less like a regular buffer. * The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a * You must dmu_bonus_hold() to get the buffer, which will give you a
@ -547,7 +567,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag,
dmu_buf_t **dbp); dmu_buf_t **dbp);
int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
uint32_t flags); dmu_flags_t flags);
int dmu_bonus_max(void); int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
@ -558,9 +578,9 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
* Special spill buffer support used by "SA" framework * Special spill buffer support used by "SA" framework
*/ */
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags,
dmu_buf_t **dbp); const void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, int dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags,
const void *tag, dmu_buf_t **dbp); const void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
@ -579,17 +599,17 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
* The object number must be a valid, allocated object number. * The object number must be a valid, allocated object number.
*/ */
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **, int flags); const void *tag, dmu_buf_t **, dmu_flags_t flags);
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, const void *tag, int *numbufsp, uint64_t length, int read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp); dmu_buf_t ***dbpp);
int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp); const void *tag, dmu_buf_t **dbp);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags); const void *tag, dmu_buf_t **dbp, dmu_flags_t flags);
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t read, const void *tag, int *numbufsp, uint64_t length, boolean_t read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp, uint32_t flags); dmu_buf_t ***dbpp, dmu_flags_t flags);
int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
dmu_buf_t **dbp); dmu_buf_t **dbp);
@ -781,6 +801,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
* (ie. you've called dmu_tx_hold_object(tx, db->db_object)). * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
*/ */
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags);
boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx); boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
@ -874,40 +895,36 @@ int dmu_free_long_object(objset_t *os, uint64_t object);
* Canfail routines will return 0 on success, or an errno if there is a * Canfail routines will return 0 on success, or an errno if there is a
* nonrecoverable I/O error. * nonrecoverable I/O error.
*/ */
#define DMU_READ_PREFETCH 0 /* prefetch */
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */
#define DMU_DIRECTIO 4 /* use Direct I/O */
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags); void *buf, dmu_flags_t flags);
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
uint32_t flags); dmu_flags_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx); const void *buf, dmu_tx_t *tx);
int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx); const void *buf, dmu_tx_t *tx, dmu_flags_t flags);
int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx, uint32_t flags);
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx); dmu_tx_t *tx);
#ifdef _KERNEL #ifdef _KERNEL
int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); dmu_flags_t flags);
int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags);
int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx); dmu_tx_t *tx, dmu_flags_t flags);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx); dmu_tx_t *tx, dmu_flags_t flags);
int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx); dmu_tx_t *tx, dmu_flags_t flags);
#endif #endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf); void dmu_return_arcbuf(struct arc_buf *buf);
int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset,
struct arc_buf *buf, dmu_tx_t *tx); struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset,
struct arc_buf *buf, dmu_tx_t *tx); struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
#define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf
extern uint_t zfs_max_recordsize; extern uint_t zfs_max_recordsize;


@ -270,11 +270,13 @@ void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *); int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *);
int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags); int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t);
int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *); int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t,
dmu_tx_t *);
#if defined(_KERNEL) #if defined(_KERNEL)
int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t); int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t);
int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *); int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t,
dmu_tx_t *);
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus


@ -81,9 +81,10 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *); void dmu_zfetch_fini(zfetch_t *);
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t, zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
boolean_t); boolean_t);
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t); void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t,
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
boolean_t); boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
boolean_t, boolean_t);
#ifdef __cplusplus #ifdef __cplusplus


@ -981,9 +981,9 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_skipped, uint64_t bytes_skipped,
uint64_t extents_failed, uint64_t bytes_failed); uint64_t extents_failed, uint64_t bytes_failed);
extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
uint32_t flags); dmu_flags_t flags);
extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
uint32_t flags); dmu_flags_t flags);
extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_add(spa_t *spa);
extern void spa_import_progress_remove(uint64_t spa_guid); extern void spa_import_progress_remove(uint64_t spa_guid);
extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,


@ -33,7 +33,9 @@
/* /*
* Platform-dependent resource accounting hooks * Platform-dependent resource accounting hooks
*/ */
void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops,
void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); dmu_flags_t flags);
void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags);
#endif /* _SYS_ZFS_RACCT_H */ #endif /* _SYS_ZFS_RACCT_H */


@ -27,13 +27,13 @@
#include <sys/zfs_racct.h> #include <sys/zfs_racct.h>
void void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{ {
(void) spa, (void) size, (void) iops, (void) flags; (void) spa, (void) size, (void) iops, (void) flags;
} }
void void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{ {
(void) spa, (void) size, (void) iops, (void) flags; (void) spa, (void) size, (void) iops, (void) flags;
} }


@ -304,7 +304,7 @@ Default dnode block size as a power of 2.
.It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int .It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int
Default dnode indirect block size as a power of 2. Default dnode indirect block size as a power of 2.
. .
.It Sy zfs_dio_enabled Ns = Ns Sy 0 Ns | Ns 1 Pq int .It Sy zfs_dio_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Enable Direct I/O. Enable Direct I/O.
If this setting is 0, then all I/O requests will be directed through the ARC If this setting is 0, then all I/O requests will be directed through the ARC
acting as though the dataset property acting as though the dataset property
@ -312,6 +312,11 @@ acting as though the dataset property
was set to was set to
.Sy disabled . .Sy disabled .
. .
.It Sy zfs_dio_strict Ns = Ns Sy 0 Ns | Ns 1 Pq int
Strictly enforce alignment for Direct I/O requests, returning
.Sy EINVAL
if not page-aligned instead of silently falling back to uncached I/O.
.
.It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 .It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
When attempting to log an output nvlist of an ioctl in the on-disk history, When attempting to log an output nvlist of an ioctl in the on-disk history,
the output will not be stored if it is larger than this size (in bytes). the output will not be stored if it is larger than this size (in bytes).


@ -41,7 +41,6 @@
#include <sys/dsl_pool.h> #include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h> #include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h> #include <sys/dsl_prop.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h> #include <sys/zfs_ioctl.h>
#include <sys/zap.h> #include <sys/zap.h>
#include <sys/zio_checksum.h> #include <sys/zio_checksum.h>
@ -71,6 +70,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
struct sf_buf *sf; struct sf_buf *sf;
int numbufs, i; int numbufs, i;
int err; int err;
dmu_flags_t flags = 0;
if (size == 0) if (size == 0)
return (0); return (0);
@ -94,10 +94,17 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size) if (tocpy == db->db_size) {
dmu_buf_will_fill(db, tx, B_FALSE); dmu_buf_will_fill(db, tx, B_FALSE);
else } else {
dmu_buf_will_dirty(db, tx); if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
if (bufoff == 0)
flags |= DMU_PARTIAL_FIRST;
else
flags |= DMU_PARTIAL_MORE;
}
dmu_buf_will_dirty_flags(db, tx, flags);
}
for (copied = 0; copied < tocpy; copied += PAGESIZE) { for (copied = 0; copied < tocpy; copied += PAGESIZE) {
ASSERT3U(ptoa((*ma)->pindex), ==, ASSERT3U(ptoa((*ma)->pindex), ==,


@ -28,7 +28,7 @@
#include <sys/racct.h> #include <sys/racct.h>
void void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{ {
curthread->td_ru.ru_inblock += iops; curthread->td_ru.ru_inblock += iops;
#ifdef RACCT #ifdef RACCT
@ -46,7 +46,7 @@ zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
} }
void void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{ {
curthread->td_ru.ru_oublock += iops; curthread->td_ru.ru_oublock += iops;
#ifdef RACCT #ifdef RACCT


@ -530,7 +530,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
page_unhold(pp); page_unhold(pp);
} else { } else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, bytes); uio, bytes, DMU_READ_PREFETCH);
} }
len -= bytes; len -= bytes;
off = 0; off = 0;


@ -679,7 +679,7 @@ zvol_strategy_impl(zv_request_t *zvr)
while (resid != 0 && off < volsize) { while (resid != 0 && off < volsize) {
size_t size = MIN(resid, zvol_maxphys); size_t size = MIN(resid, zvol_maxphys);
if (doread) { if (doread) {
error = dmu_read(os, ZVOL_OBJ, off, size, addr, error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
DMU_READ_PREFETCH); DMU_READ_PREFETCH);
} else { } else {
dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_t *tx = dmu_tx_create(os);
@ -688,7 +688,8 @@ zvol_strategy_impl(zv_request_t *zvr)
if (error) { if (error) {
dmu_tx_abort(tx); dmu_tx_abort(tx);
} else { } else {
dmu_write(os, ZVOL_OBJ, off, size, addr, tx); dmu_write_by_dnode(zv->zv_dn, off, size, addr,
tx, DMU_READ_PREFETCH);
zvol_log_write(zv, tx, off, size, commit); zvol_log_write(zv, tx, off, size, commit);
dmu_tx_commit(tx); dmu_tx_commit(tx);
} }
@ -834,7 +835,8 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
if (bytes > volsize - zfs_uio_offset(&uio)) if (bytes > volsize - zfs_uio_offset(&uio))
bytes = volsize - zfs_uio_offset(&uio); bytes = volsize - zfs_uio_offset(&uio);
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
DMU_READ_PREFETCH);
if (error) { if (error) {
/* Convert checksum errors into IO errors. */ /* Convert checksum errors into IO errors. */
if (error == ECKSUM) if (error == ECKSUM)
@ -893,7 +895,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
dmu_tx_abort(tx); dmu_tx_abort(tx);
break; break;
} }
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
DMU_READ_PREFETCH);
if (error == 0) if (error == 0)
zvol_log_write(zv, tx, off, bytes, commit); zvol_log_write(zv, tx, off, bytes, commit);
dmu_tx_commit(tx); dmu_tx_commit(tx);


@ -30,14 +30,14 @@
#include <linux/task_io_accounting_ops.h> #include <linux/task_io_accounting_ops.h>
void void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{ {
task_io_account_read(size); task_io_account_read(size);
spa_iostats_read_add(spa, size, iops, flags); spa_iostats_read_add(spa, size, iops, flags);
} }
void void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{ {
task_io_account_write(size); task_io_account_write(size);
spa_iostats_write_add(spa, size, iops, flags); spa_iostats_write_add(spa, size, iops, flags);
@ -46,13 +46,13 @@ zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
#else #else
void void
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{ {
(void) spa, (void) size, (void) iops, (void) flags; (void) spa, (void) size, (void) iops, (void) flags;
} }
void void
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
{ {
(void) spa, (void) size, (void) iops, (void) flags; (void) spa, (void) size, (void) iops, (void) flags;
} }


@ -329,7 +329,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
put_page(pp); put_page(pp);
} else { } else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, bytes); uio, bytes, DMU_READ_PREFETCH);
} }
len -= bytes; len -= bytes;


@ -258,7 +258,8 @@ zvol_write(zv_request_t *zvr)
dmu_tx_abort(tx); dmu_tx_abort(tx);
break; break;
} }
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
DMU_READ_PREFETCH);
if (error == 0) { if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync); zvol_log_write(zv, tx, off, bytes, sync);
} }
@ -428,7 +429,8 @@ zvol_read(zv_request_t *zvr)
if (bytes > volsize - uio.uio_loffset) if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio.uio_loffset; bytes = volsize - uio.uio_loffset;
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
DMU_READ_PREFETCH);
if (error) { if (error) {
/* convert checksum errors into IO errors */ /* convert checksum errors into IO errors */
if (error == ECKSUM) if (error == ECKSUM)


@ -6103,7 +6103,9 @@ top:
ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
metadata, misses); metadata, misses);
zfs_racct_read(spa, size, 1, 0); zfs_racct_read(spa, size, 1,
(*arc_flags & ARC_FLAG_UNCACHED) ?
DMU_UNCACHEDIO : 0);
} }
/* Check if the spa even has l2 configured */ /* Check if the spa even has l2 configured */


@ -1499,7 +1499,8 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
* decrypt / authenticate them when we need to read an encrypted bonus buffer. * decrypt / authenticate them when we need to read an encrypted bonus buffer.
*/ */
static int static int
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn,
dmu_flags_t flags)
{ {
objset_t *os = db->db_objset; objset_t *os = db->db_objset;
dmu_buf_impl_t *dndb; dmu_buf_impl_t *dndb;
@ -1507,7 +1508,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
zbookmark_phys_t zb; zbookmark_phys_t zb;
int err; int err;
if ((flags & DB_RF_NO_DECRYPT) != 0 || if ((flags & DMU_READ_NO_DECRYPT) != 0 ||
!os->os_encrypted || os->os_raw_receive || !os->os_encrypted || os->os_raw_receive ||
(dndb = dn->dn_dbuf) == NULL) (dndb = dn->dn_dbuf) == NULL)
return (0); return (0);
@ -1561,7 +1562,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
* returning. * returning.
*/ */
static int static int
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
db_lock_type_t dblt, blkptr_t *bp, const void *tag) db_lock_type_t dblt, blkptr_t *bp, const void *tag)
{ {
zbookmark_phys_t zb; zbookmark_phys_t zb;
@ -1627,7 +1628,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
zio_flags = (flags & DB_RF_CANFAIL) ? zio_flags = (flags & DB_RF_CANFAIL) ?
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp)) if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW; zio_flags |= ZIO_FLAG_RAW;
/* /*
@ -1728,7 +1729,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} }
int int
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags)
{ {
dnode_t *dn; dnode_t *dn;
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch; boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
@ -1748,12 +1749,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
goto done; goto done;
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
(flags & DB_RF_NOPREFETCH) == 0; (flags & DMU_READ_NO_PREFETCH) == 0;
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
if (flags & DB_RF_PARTIAL_FIRST) if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
db->db_pending_evict = B_FALSE;
if (flags & DMU_PARTIAL_FIRST)
db->db_partial_read = B_TRUE; db->db_partial_read = B_TRUE;
else if (!(flags & DB_RF_PARTIAL_MORE)) else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING)))
db->db_partial_read = B_FALSE; db->db_partial_read = B_FALSE;
miss = (db->db_state != DB_CACHED); miss = (db->db_state != DB_CACHED);
@ -1794,7 +1797,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
* unauthenticated blocks, which will verify their MAC if * unauthenticated blocks, which will verify their MAC if
* the key is now available. * the key is now available.
*/ */
if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL && if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL &&
(arc_is_encrypted(db->db_buf) || (arc_is_encrypted(db->db_buf) ||
arc_is_unauthenticated(db->db_buf) || arc_is_unauthenticated(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@ -1842,7 +1845,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
if (err == 0 && prefetch) { if (err == 0 && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss, dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
flags & DB_RF_HAVESTRUCT); flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) ||
db->db_pending_evict);
} }
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
@ -1874,11 +1878,14 @@ done:
} }
static void static void
dbuf_noread(dmu_buf_impl_t *db) dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)
{ {
ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
db->db_pending_evict = B_FALSE;
db->db_partial_read = B_FALSE;
while (db->db_state == DB_READ || db->db_state == DB_FILL) while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx); cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) { if (db->db_state == DB_UNCACHED) {
@ -2191,8 +2198,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
kmem_free(dr, sizeof (*dr)); kmem_free(dr, sizeof (*dr));
return (NULL); return (NULL);
} }
int err = dbuf_read(parent_db, NULL, int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL |
(DB_RF_NOPREFETCH | DB_RF_CANFAIL)); DMU_READ_NO_PREFETCH);
if (err != 0) { if (err != 0) {
dbuf_rele(parent_db, FTAG); dbuf_rele(parent_db, FTAG);
kmem_free(dr, sizeof (*dr)); kmem_free(dr, sizeof (*dr));
@ -2620,8 +2627,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE); return (B_FALSE);
} }
static void void
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags)
{ {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
boolean_t undirty = B_FALSE; boolean_t undirty = B_FALSE;
@ -2673,7 +2680,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
* not the uderlying block that is being replaced. dbuf_undirty() will * not the uderlying block that is being replaced. dbuf_undirty() will
* do brt_pending_remove() before removing the dirty record. * do brt_pending_remove() before removing the dirty record.
*/ */
(void) dbuf_read(db, NULL, flags); (void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED);
if (undirty) { if (undirty) {
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
VERIFY(!dbuf_undirty(db, tx)); VERIFY(!dbuf_undirty(db, tx));
@ -2685,8 +2692,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
void void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{ {
dmu_buf_will_dirty_impl(db_fake, dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
} }
boolean_t boolean_t
@ -2850,7 +2856,7 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
DBUF_VERIFY(db); DBUF_VERIFY(db);
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
dbuf_noread(db); dbuf_noread(db, DMU_KEEP_CACHING);
(void) dbuf_dirty(db, tx); (void) dbuf_dirty(db, tx);
} }
@ -2864,12 +2870,13 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
DTRACE_SET_STATE(db, "allocating NOFILL buffer"); DTRACE_SET_STATE(db, "allocating NOFILL buffer");
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
dbuf_noread(db); dbuf_noread(db, DMU_KEEP_CACHING);
(void) dbuf_dirty(db, tx); (void) dbuf_dirty(db, tx);
} }
void void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,
dmu_flags_t flags)
{ {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@ -2891,7 +2898,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
*/ */
if (canfail && dr) { if (canfail && dr) {
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx); dmu_buf_will_dirty_flags(db_fake, tx, flags);
return; return;
} }
/* /*
@ -2907,10 +2914,16 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
} }
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
dbuf_noread(db); dbuf_noread(db, flags);
(void) dbuf_dirty(db, tx); (void) dbuf_dirty(db, tx);
} }
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
{
dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH);
}
/* /*
* This function is effectively the same as dmu_buf_will_dirty(), but * This function is effectively the same as dmu_buf_will_dirty(), but
* indicates the caller expects raw encrypted data in the db, and provides * indicates the caller expects raw encrypted data in the db, and provides
@ -2933,8 +2946,8 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
ASSERT0(db->db_level); ASSERT0(db->db_level);
ASSERT(db->db_objset->os_raw_receive); ASSERT(db->db_objset->os_raw_receive);
dmu_buf_will_dirty_impl(db_fake, dmu_buf_will_dirty_flags(db_fake, tx,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
dr = dbuf_find_dirty_eq(db, tx->tx_txg); dr = dbuf_find_dirty_eq(db, tx->tx_txg);
@ -3076,7 +3089,8 @@ dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
*/ */
void void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
dmu_flags_t flags)
{ {
ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_BONUS_BLKID);
@ -3090,6 +3104,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
ASSERT(arc_released(buf)); ASSERT(arc_released(buf));
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
db->db_pending_evict = B_FALSE;
db->db_partial_read = B_FALSE;
while (db->db_state == DB_READ || db->db_state == DB_FILL) while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx); cv_wait(&db->db_changed, &db->db_mtx);
@ -3344,8 +3361,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
if (err) if (err)
return (err); return (err);
err = dbuf_read(*parentp, NULL, err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL |
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH);
if (err) { if (err) {
dbuf_rele(*parentp, NULL); dbuf_rele(*parentp, NULL);
*parentp = NULL; *parentp = NULL;
@ -3404,7 +3421,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_user = NULL; db->db_user = NULL;
db->db_user_immediate_evict = FALSE; db->db_user_immediate_evict = FALSE;
db->db_freed_in_flight = FALSE; db->db_freed_in_flight = FALSE;
db->db_pending_evict = FALSE; db->db_pending_evict = TRUE;
db->db_partial_read = FALSE;
if (blkid == DMU_BONUS_BLKID) { if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf); ASSERT3P(parent, ==, dn->dn_dbuf);
@ -3615,8 +3633,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dbuf_prefetch_fini(dpa, B_TRUE); dbuf_prefetch_fini(dpa, B_TRUE);
return; return;
} }
(void) dbuf_read(db, NULL, (void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); DMU_READ_NO_PREFETCH);
dbuf_rele(db, FTAG); dbuf_rele(db, FTAG);
} }
@ -4002,6 +4020,7 @@ dbuf_create_bonus(dnode_t *dn)
ASSERT(dn->dn_bonus == NULL); ASSERT(dn->dn_bonus == NULL);
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL, dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID)); dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
dn->dn_bonus->db_pending_evict = FALSE;
} }
int int
@ -4167,8 +4186,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
* This dbuf has anonymous data associated with it. * This dbuf has anonymous data associated with it.
*/ */
dbuf_destroy(db); dbuf_destroy(db);
} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) || } else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) {
db->db_pending_evict) { /*
* We don't expect more accesses to the dbuf, and it
* is either not cacheable or was marked for eviction.
*/
dbuf_destroy(db); dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) { } else if (!multilist_link_active(&db->db_cache_link)) {
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);


@ -222,20 +222,14 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
int int
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags) const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
{ {
int err; int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
if (err == 0) { if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags); err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
if (err != 0) { if (err != 0) {
dbuf_rele(db, tag); dbuf_rele(db, tag);
*dbp = NULL; *dbp = NULL;
@ -247,20 +241,14 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
int int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags) const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
{ {
int err; int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread(os, object, offset, tag, dbp); err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
if (err == 0) { if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags); err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
if (err != 0) { if (err != 0) {
dbuf_rele(db, tag); dbuf_rele(db, tag);
*dbp = NULL; *dbp = NULL;
@ -358,16 +346,10 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
* Returns ENOENT, EIO, or 0. * Returns ENOENT, EIO, or 0.
*/ */
int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
uint32_t flags) dmu_flags_t flags)
{ {
dmu_buf_impl_t *db; dmu_buf_impl_t *db;
int error; int error;
uint32_t db_flags = DB_RF_MUST_SUCCEED;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus == NULL) { if (dn->dn_bonus == NULL) {
@ -393,7 +375,7 @@ int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
*/ */
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
error = dbuf_read(db, NULL, db_flags); error = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
if (error) { if (error) {
dnode_evict_bonus(dn); dnode_evict_bonus(dn);
dbuf_rele(db, tag); dbuf_rele(db, tag);
@ -431,7 +413,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp)
* dmu_spill_hold_existing() should be used. * dmu_spill_hold_existing() should be used.
*/ */
int int
dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags, const void *tag,
dmu_buf_t **dbp) dmu_buf_t **dbp)
{ {
dmu_buf_impl_t *db = NULL; dmu_buf_impl_t *db = NULL;
@ -489,18 +471,14 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp)
} }
int int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags, const void *tag,
dmu_buf_t **dbp) dmu_buf_t **dbp)
{ {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
int err; int err;
uint32_t db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
DB_DNODE_ENTER(db); DB_DNODE_ENTER(db);
err = dmu_spill_hold_by_dnode(DB_DNODE(db), db_flags, tag, dbp); err = dmu_spill_hold_by_dnode(DB_DNODE(db), flags, tag, dbp);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
return (err); return (err);
@ -515,12 +493,12 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
int int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp,
uint32_t flags) dmu_flags_t flags)
{ {
dmu_buf_t **dbp; dmu_buf_t **dbp;
zstream_t *zs = NULL; zstream_t *zs = NULL;
uint64_t blkid, nblks, i; uint64_t blkid, nblks, i;
uint32_t dbuf_flags; dmu_flags_t dbuf_flags;
int err; int err;
zio_t *zio = NULL; zio_t *zio = NULL;
boolean_t missed = B_FALSE; boolean_t missed = B_FALSE;
@ -532,11 +510,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
* we can tell it about the multi-block read. dbuf_read() only knows * we can tell it about the multi-block read. dbuf_read() only knows
* about the one block it is accessing. * about the one block it is accessing.
*/ */
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | dbuf_flags = (flags & ~DMU_READ_PREFETCH) | DMU_READ_NO_PREFETCH |
DB_RF_NOPREFETCH; DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
if ((flags & DMU_READ_NO_DECRYPT) != 0)
dbuf_flags |= DB_RF_NO_DECRYPT;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) { if (dn->dn_datablkshift) {
@ -569,15 +544,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
* that if multiple threads block on same indirect block, we * that if multiple threads block on same indirect block, we
* base predictions on the original less racy request order. * base predictions on the original less racy request order.
*/ */
zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read, zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
B_TRUE); read && !(flags & DMU_DIRECTIO), B_TRUE);
} }
for (i = 0; i < nblks; i++) { for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) { if (db == NULL) {
if (zs) { if (zs) {
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
B_TRUE); B_TRUE, (flags & DMU_UNCACHEDIO));
} }
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag); dmu_buf_rele_array(dbp, nblks, tag);
@ -599,9 +574,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
offset + length < db->db.db_offset + offset + length < db->db.db_offset +
db->db.db_size) { db->db.db_size) {
if (offset <= db->db.db_offset) if (offset <= db->db.db_offset)
dbuf_flags |= DB_RF_PARTIAL_FIRST; dbuf_flags |= DMU_PARTIAL_FIRST;
else else
dbuf_flags |= DB_RF_PARTIAL_MORE; dbuf_flags |= DMU_PARTIAL_MORE;
} }
(void) dbuf_read(db, zio, dbuf_flags); (void) dbuf_read(db, zio, dbuf_flags);
if (db->db_state != DB_CACHED) if (db->db_state != DB_CACHED)
@ -621,8 +596,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
if (!read && ((flags & DMU_DIRECTIO) == 0)) if (!read && ((flags & DMU_DIRECTIO) == 0))
zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags); zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
if (zs) if (zs) {
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE,
(flags & DMU_UNCACHEDIO));
}
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
if (read) { if (read) {
@ -1170,7 +1147,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
static int static int
dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
void *buf, uint32_t flags) void *buf, dmu_flags_t flags)
{ {
dmu_buf_t **dbp; dmu_buf_t **dbp;
int numbufs, err = 0; int numbufs, err = 0;
@ -1198,6 +1175,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
abd_free(data); abd_free(data);
return (err); return (err);
} }
flags &= ~DMU_DIRECTIO;
while (size > 0) { while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
@ -1236,7 +1214,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
int int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags) void *buf, dmu_flags_t flags)
{ {
dnode_t *dn; dnode_t *dn;
int err; int err;
@ -1252,14 +1230,14 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
int int
dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
uint32_t flags) dmu_flags_t flags)
{ {
return (dmu_read_impl(dn, offset, size, buf, flags)); return (dmu_read_impl(dn, offset, size, buf, flags));
} }
static void static void
dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx) const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
{ {
int i; int i;
@ -1275,10 +1253,17 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size) if (tocpy == db->db_size) {
dmu_buf_will_fill(db, tx, B_FALSE); dmu_buf_will_fill_flags(db, tx, B_FALSE, flags);
else } else {
dmu_buf_will_dirty(db, tx); if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
if (bufoff == 0)
flags |= DMU_PARTIAL_FIRST;
else
flags |= DMU_PARTIAL_MORE;
}
dmu_buf_will_dirty_flags(db, tx, flags);
}
ASSERT(db->db_data != NULL); ASSERT(db->db_data != NULL);
(void) memcpy((char *)db->db_data + bufoff, buf, tocpy); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
@ -1304,17 +1289,13 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
VERIFY0(dmu_buf_hold_array(os, object, offset, size, VERIFY0(dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp)); FALSE, FTAG, &numbufs, &dbp));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_write_impl(dbp, numbufs, offset, size, buf, tx, DMU_READ_PREFETCH);
dmu_buf_rele_array(dbp, numbufs, FTAG); dmu_buf_rele_array(dbp, numbufs, FTAG);
} }
/*
* This interface is not used internally by ZFS but is provided for
* use by Lustre which is built on the DMU interfaces.
*/
int int
dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size, dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx, uint32_t flags) const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
{ {
dmu_buf_t **dbp; dmu_buf_t **dbp;
int numbufs; int numbufs;
@ -1327,25 +1308,19 @@ dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) && if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
zfs_dio_aligned(offset, size, dn->dn_datablksz)) { zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
abd_t *data = abd_get_from_buf((void *)buf, size); abd_t *data = abd_get_from_buf((void *)buf, size);
error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx); error = dmu_write_abd(dn, offset, size, data, flags, tx);
abd_free(data); abd_free(data);
return (error); return (error);
} }
flags &= ~DMU_DIRECTIO;
VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); FALSE, FTAG, &numbufs, &dbp, flags));
dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_write_impl(dbp, numbufs, offset, size, buf, tx, flags);
dmu_buf_rele_array(dbp, numbufs, FTAG); dmu_buf_rele_array(dbp, numbufs, FTAG);
return (0); return (0);
} }
int
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
{
return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0));
}
void void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx) dmu_tx_t *tx)
@ -1402,20 +1377,22 @@ dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
#ifdef _KERNEL #ifdef _KERNEL
int int
dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags)
{ {
dmu_buf_t **dbp; dmu_buf_t **dbp;
int numbufs, i, err; int numbufs, i, err;
if (uio->uio_extflg & UIO_DIRECT) if (uio->uio_extflg & UIO_DIRECT)
return (dmu_read_uio_direct(dn, uio, size)); return (dmu_read_uio_direct(dn, uio, size, flags));
flags &= ~DMU_DIRECTIO;
/* /*
* NB: we could do this block-at-a-time, but it's nice * NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel. * to be reading in parallel.
*/ */
err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
TRUE, FTAG, &numbufs, &dbp, 0); TRUE, FTAG, &numbufs, &dbp, flags);
if (err) if (err)
return (err); return (err);
@ -1453,7 +1430,8 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
* because we don't have to find the dnode_t for the object. * because we don't have to find the dnode_t for the object.
*/ */
int int
dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags)
{ {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
int err; int err;
@ -1462,7 +1440,7 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
return (0); return (0);
DB_DNODE_ENTER(db); DB_DNODE_ENTER(db);
err = dmu_read_uio_dnode(DB_DNODE(db), uio, size); err = dmu_read_uio_dnode(DB_DNODE(db), uio, size, flags);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
return (err); return (err);
@ -1474,7 +1452,8 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
* Starting at offset zfs_uio_offset(uio). * Starting at offset zfs_uio_offset(uio).
*/ */
int int
dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size) dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags)
{ {
dnode_t *dn; dnode_t *dn;
int err; int err;
@ -1486,7 +1465,7 @@ dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
if (err) if (err)
return (err); return (err);
err = dmu_read_uio_dnode(dn, uio, size); err = dmu_read_uio_dnode(dn, uio, size, flags);
dnode_rele(dn, FTAG); dnode_rele(dn, FTAG);
@ -1494,12 +1473,14 @@ dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
} }
int int
dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx,
dmu_flags_t flags)
{ {
dmu_buf_t **dbp; dmu_buf_t **dbp;
int numbufs; int numbufs;
int err = 0; int err = 0;
uint64_t write_size; uint64_t write_size;
dmu_flags_t oflags = flags;
top: top:
write_size = size; write_size = size;
@ -1512,13 +1493,14 @@ top:
(write_size >= dn->dn_datablksz)) { (write_size >= dn->dn_datablksz)) {
if (zfs_dio_aligned(zfs_uio_offset(uio), write_size, if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
dn->dn_datablksz)) { dn->dn_datablksz)) {
return (dmu_write_uio_direct(dn, uio, size, tx)); return (dmu_write_uio_direct(dn, uio, size, flags, tx));
} else if (write_size > dn->dn_datablksz && } else if (write_size > dn->dn_datablksz &&
zfs_dio_offset_aligned(zfs_uio_offset(uio), zfs_dio_offset_aligned(zfs_uio_offset(uio),
dn->dn_datablksz)) { dn->dn_datablksz)) {
write_size = write_size =
dn->dn_datablksz * (write_size / dn->dn_datablksz); dn->dn_datablksz * (write_size / dn->dn_datablksz);
err = dmu_write_uio_direct(dn, uio, write_size, tx); err = dmu_write_uio_direct(dn, uio, write_size, flags,
tx);
if (err == 0) { if (err == 0) {
size -= write_size; size -= write_size;
goto top; goto top;
@ -1530,9 +1512,10 @@ top:
P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz); P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
} }
} }
flags &= ~DMU_DIRECTIO;
err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size, err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); FALSE, FTAG, &numbufs, &dbp, flags);
if (err) if (err)
return (err); return (err);
@ -1549,10 +1532,17 @@ top:
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size) if (tocpy == db->db_size) {
dmu_buf_will_fill(db, tx, B_TRUE); dmu_buf_will_fill_flags(db, tx, B_TRUE, flags);
else } else {
dmu_buf_will_dirty(db, tx); if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
if (bufoff == 0)
flags |= DMU_PARTIAL_FIRST;
else
flags |= DMU_PARTIAL_MORE;
}
dmu_buf_will_dirty_flags(db, tx, flags);
}
ASSERT(db->db_data != NULL); ASSERT(db->db_data != NULL);
err = zfs_uio_fault_move((char *)db->db_data + bufoff, err = zfs_uio_fault_move((char *)db->db_data + bufoff,
@ -1575,6 +1565,7 @@ top:
dmu_buf_rele_array(dbp, numbufs, FTAG); dmu_buf_rele_array(dbp, numbufs, FTAG);
if ((uio->uio_extflg & UIO_DIRECT) && size > 0) { if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
flags = oflags;
goto top; goto top;
} }
@ -1592,7 +1583,7 @@ top:
*/ */
int int
dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx) dmu_tx_t *tx, dmu_flags_t flags)
{ {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
int err; int err;
@ -1601,7 +1592,7 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
return (0); return (0);
DB_DNODE_ENTER(db); DB_DNODE_ENTER(db);
err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx); err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx, flags);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
return (err); return (err);
@ -1614,7 +1605,7 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
*/ */
int int
dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
dmu_tx_t *tx) dmu_tx_t *tx, dmu_flags_t flags)
{ {
dnode_t *dn; dnode_t *dn;
int err; int err;
@ -1626,7 +1617,7 @@ dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
if (err) if (err)
return (err); return (err);
err = dmu_write_uio_dnode(dn, uio, size, tx); err = dmu_write_uio_dnode(dn, uio, size, tx, flags);
dnode_rele(dn, FTAG); dnode_rele(dn, FTAG);
@ -1796,11 +1787,10 @@ dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
*/ */
int int
dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx) dmu_tx_t *tx, dmu_flags_t flags)
{ {
dmu_buf_impl_t *db; dmu_buf_impl_t *db;
objset_t *os = dn->dn_objset; objset_t *os = dn->dn_objset;
uint64_t object = dn->dn_object;
uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
uint64_t blkid; uint64_t blkid;
@ -1816,8 +1806,8 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
* same size as the dbuf. * same size as the dbuf.
*/ */
if (offset == db->db.db_offset && blksz == db->db.db_size) { if (offset == db->db.db_offset && blksz == db->db.db_size) {
zfs_racct_write(os->os_spa, blksz, 1, 0); zfs_racct_write(os->os_spa, blksz, 1, flags);
dbuf_assign_arcbuf(db, buf, tx); dbuf_assign_arcbuf(db, buf, tx, flags);
dbuf_rele(db, FTAG); dbuf_rele(db, FTAG);
} else { } else {
/* compressed bufs must always be assignable to their dbuf */ /* compressed bufs must always be assignable to their dbuf */
@ -1825,7 +1815,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
dbuf_rele(db, FTAG); dbuf_rele(db, FTAG);
dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_write_by_dnode(dn, offset, blksz, buf->b_data, tx, flags);
dmu_return_arcbuf(buf); dmu_return_arcbuf(buf);
} }
@ -1834,13 +1824,13 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
int int
dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx) dmu_tx_t *tx, dmu_flags_t flags)
{ {
int err; int err;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
DB_DNODE_ENTER(db); DB_DNODE_ENTER(db);
err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx); err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx, flags);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
return (err); return (err);
@ -1985,7 +1975,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
int error; int error;
error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
DB_RF_CANFAIL | DB_RF_NOPREFETCH); DB_RF_CANFAIL | DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
if (error != 0) if (error != 0)
return (error); return (error);
@ -2928,7 +2918,6 @@ EXPORT_SYMBOL(dmu_read_uio_dbuf);
EXPORT_SYMBOL(dmu_read_uio_dnode); EXPORT_SYMBOL(dmu_read_uio_dnode);
EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write);
EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_write_by_dnode);
EXPORT_SYMBOL(dmu_write_by_dnode_flags);
EXPORT_SYMBOL(dmu_write_uio); EXPORT_SYMBOL(dmu_write_uio);
EXPORT_SYMBOL(dmu_write_uio_dbuf); EXPORT_SYMBOL(dmu_write_uio_dbuf);
EXPORT_SYMBOL(dmu_write_uio_dnode); EXPORT_SYMBOL(dmu_write_uio_dnode);
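
For illustration only, a minimal sketch (not from this patch) of how a DMU consumer might drive the consolidated dmu_flags_t interface above; the helper name example_write_block() and its arguments are assumptions, and it presumes a held dnode and an assigned transaction.

#include <sys/dmu.h>

/*
 * Hypothetical helper: write one block through the ARC, allowing
 * speculative prefetch for any read-modify-write, while asking that
 * the data be evicted from the caches soon afterwards (uncached I/O).
 */
static int
example_write_block(dnode_t *dn, uint64_t off, uint64_t len,
    const void *buf, dmu_tx_t *tx)
{
	dmu_flags_t flags = DMU_READ_PREFETCH | DMU_UNCACHEDIO;

	/* dmu_write_by_dnode() now takes the flags directly. */
	return (dmu_write_by_dnode(dn, off, len, buf, tx, flags));
}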


@ -208,7 +208,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
int int
dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size, dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
abd_t *data, uint32_t flags, dmu_tx_t *tx) abd_t *data, dmu_flags_t flags, dmu_tx_t *tx)
{ {
dmu_buf_t **dbp; dmu_buf_t **dbp;
spa_t *spa = dn->dn_objset->os_spa; spa_t *spa = dn->dn_objset->os_spa;
@ -247,7 +247,7 @@ dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
int int
dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
abd_t *data, uint32_t flags) abd_t *data, dmu_flags_t flags)
{ {
objset_t *os = dn->dn_objset; objset_t *os = dn->dn_objset;
spa_t *spa = os->os_spa; spa_t *spa = os->os_spa;
@ -351,7 +351,8 @@ error:
#ifdef _KERNEL #ifdef _KERNEL
int int
dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size) dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags)
{ {
offset_t offset = zfs_uio_offset(uio); offset_t offset = zfs_uio_offset(uio);
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT; offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
@ -362,7 +363,7 @@ dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index], abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
offset & (PAGESIZE - 1), size); offset & (PAGESIZE - 1), size);
err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO); err = dmu_read_abd(dn, offset, size, data, flags);
abd_free(data); abd_free(data);
if (err == 0) if (err == 0)
@ -372,7 +373,8 @@ dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
} }
int int
dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
dmu_flags_t flags, dmu_tx_t *tx)
{ {
offset_t offset = zfs_uio_offset(uio); offset_t offset = zfs_uio_offset(uio);
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT; offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
@ -383,7 +385,7 @@ dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index], abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
offset & (PAGESIZE - 1), size); offset & (PAGESIZE - 1), size);
err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx); err = dmu_write_abd(dn, offset, size, data, flags, tx);
abd_free(data); abd_free(data);
if (err == 0) if (err == 0)


@ -2332,12 +2332,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
data = DN_BONUS(dn->dn_phys); data = DN_BONUS(dn->dn_phys);
} }
} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
int rf = 0; dmu_flags_t rf = DB_RF_MUST_SUCCEED;
if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT; rf |= DB_RF_HAVESTRUCT;
error = dmu_spill_hold_by_dnode(dn, error = dmu_spill_hold_by_dnode(dn, rf,
rf | DB_RF_MUST_SUCCEED,
FTAG, (dmu_buf_t **)&db); FTAG, (dmu_buf_t **)&db);
ASSERT(error == 0); ASSERT(error == 0);
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);


@ -2135,7 +2135,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
if (data != NULL) { if (data != NULL) {
dmu_buf_t *db; dmu_buf_t *db;
dnode_t *dn; dnode_t *dn;
uint32_t flags = DMU_READ_NO_PREFETCH; dmu_flags_t flags = DMU_READ_NO_PREFETCH;
if (rwa->raw) if (rwa->raw)
flags |= DMU_READ_NO_DECRYPT; flags |= DMU_READ_NO_DECRYPT;
@ -2277,14 +2277,18 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
dmu_write_by_dnode(dn, dmu_write_by_dnode(dn,
drrw->drr_offset, drrw->drr_offset,
drrw->drr_logical_size, drrw->drr_logical_size,
abd_to_buf(decomp_abd), tx); abd_to_buf(decomp_abd), tx,
DMU_READ_NO_PREFETCH |
DMU_UNCACHEDIO);
} }
abd_free(decomp_abd); abd_free(decomp_abd);
} else { } else {
dmu_write_by_dnode(dn, dmu_write_by_dnode(dn,
drrw->drr_offset, drrw->drr_offset,
drrw->drr_logical_size, drrw->drr_logical_size,
abd_to_buf(abd), tx); abd_to_buf(abd), tx,
DMU_READ_NO_PREFETCH |
DMU_UNCACHEDIO);
} }
if (err == 0) if (err == 0)
abd_free(abd); abd_free(abd);
@ -2407,10 +2411,10 @@ receive_process_write_record(struct receive_writer_arg *rwa,
if (rwa->heal) { if (rwa->heal) {
blkptr_t *bp; blkptr_t *bp;
dmu_buf_t *dbp; dmu_buf_t *dbp;
int flags = DB_RF_CANFAIL; dmu_flags_t flags = DB_RF_CANFAIL;
if (rwa->raw) if (rwa->raw)
flags |= DB_RF_NO_DECRYPT; flags |= DMU_READ_NO_DECRYPT;
if (rwa->byteswap) { if (rwa->byteswap) {
dmu_object_byteswap_t byteswap = dmu_object_byteswap_t byteswap =
@ -2567,8 +2571,8 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
rwa->max_object = drrs->drr_object; rwa->max_object = drrs->drr_object;
VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG, if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT |
&db_spill)) != 0) { DB_RF_CANFAIL, FTAG, &db_spill)) != 0) {
dmu_buf_rele(db, FTAG); dmu_buf_rele(db, FTAG);
return (err); return (err);
} }
@ -2621,7 +2625,8 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs)); memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs));
abd_free(abd); abd_free(abd);
dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx,
DMU_UNCACHEDIO);
dmu_buf_rele(db, FTAG); dmu_buf_rele(db, FTAG);
dmu_buf_rele(db_spill, FTAG); dmu_buf_rele(db_spill, FTAG);


@ -297,7 +297,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
} }
if (BP_GET_LEVEL(bp) > 0) { if (BP_GET_LEVEL(bp) > 0) {
uint32_t flags = ARC_FLAG_WAIT; arc_flags_t flags = ARC_FLAG_WAIT;
int32_t i, ptidx, pidx; int32_t i, ptidx, pidx;
uint32_t prefetchlimit; uint32_t prefetchlimit;
int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
@ -364,8 +364,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
kmem_free(czb, sizeof (zbookmark_phys_t)); kmem_free(czb, sizeof (zbookmark_phys_t));
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_FLAG_WAIT; arc_flags_t flags = ARC_FLAG_WAIT;
uint32_t zio_flags = ZIO_FLAG_CANFAIL; zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
int32_t i; int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
dnode_phys_t *child_dnp; dnode_phys_t *child_dnp;
@ -397,7 +397,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
break; break;
} }
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t zio_flags = ZIO_FLAG_CANFAIL; zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t flags = ARC_FLAG_WAIT; arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp; objset_phys_t *osp;
@ -669,7 +669,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
/* See comment on ZIL traversal in dsl_scan_visitds. */ /* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
uint32_t flags = ARC_FLAG_WAIT; arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp; objset_phys_t *osp;
arc_buf_t *buf; arc_buf_t *buf;
ASSERT(!BP_IS_REDACTED(rootbp)); ASSERT(!BP_IS_REDACTED(rootbp));


@ -222,8 +222,8 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
* PARTIAL_FIRST allows caching for uncacheable blocks. It will * PARTIAL_FIRST allows caching for uncacheable blocks. It will
* be cleared after dmu_buf_will_dirty() call dbuf_read() again. * be cleared after dmu_buf_will_dirty() call dbuf_read() again.
*/ */
err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH | err = dbuf_read(db, zio, DB_RF_CANFAIL | DMU_READ_NO_PREFETCH |
(level == 0 ? DB_RF_PARTIAL_FIRST : 0)); (level == 0 ? (DMU_UNCACHEDIO | DMU_PARTIAL_FIRST) : 0));
dbuf_rele(db, FTAG); dbuf_rele(db, FTAG);
return (err); return (err);
} }


@ -690,7 +690,7 @@ prescient:
void void
dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed, dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
boolean_t have_lock) boolean_t have_lock, boolean_t uncached)
{ {
int64_t pf_start, pf_end, ipf_start, ipf_end; int64_t pf_start, pf_end, ipf_start, ipf_end;
int epbs, issued; int epbs, issued;
@ -745,7 +745,8 @@ dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
issued = 0; issued = 0;
for (int64_t blk = pf_start; blk < pf_end; blk++) { for (int64_t blk = pf_start; blk < pf_end; blk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); ZIO_PRIORITY_ASYNC_READ, uncached ?
ARC_FLAG_UNCACHED : 0, dmu_zfetch_done, zs);
} }
for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
@ -761,13 +762,13 @@ dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
void void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
boolean_t missed, boolean_t have_lock) boolean_t missed, boolean_t have_lock, boolean_t uncached)
{ {
zstream_t *zs; zstream_t *zs;
zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
if (zs) if (zs)
dmu_zfetch_run(zf, zs, missed, have_lock); dmu_zfetch_run(zf, zs, missed, have_lock, uncached);
} }
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
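
A call-site sketch for the extended prefetch entry point (illustrative only, not taken from the patch): a reader that already holds dn->dn_struct_rwlock could request uncached speculative prefetch as follows; blkid and nblks are assumed to be computed by the caller.

/*
 * Prefetch nblks level-0 blocks starting at blkid.  Passing the new
 * uncached argument as B_TRUE marks the prefetch ARC_FLAG_UNCACHED so
 * the speculatively read buffers are evicted early.
 */
dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
    B_TRUE,	/* fetch_data: prefetch data blocks, not only indirects */
    B_TRUE,	/* missed: the triggering access missed the ARC */
    B_TRUE,	/* have_lock: dn_struct_rwlock is already held */
    B_TRUE);	/* uncached: evict the prefetched buffers early */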


@ -1510,7 +1510,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
* if we get the encrypted or decrypted version. * if we get the encrypted or decrypted version.
*/ */
err = dbuf_read(db, NULL, DB_RF_CANFAIL | err = dbuf_read(db, NULL, DB_RF_CANFAIL |
DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH); DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
if (err) { if (err) {
DNODE_STAT_BUMP(dnode_hold_dbuf_read); DNODE_STAT_BUMP(dnode_hold_dbuf_read);
dbuf_rele(db, FTAG); dbuf_rele(db, FTAG);
@ -2578,7 +2578,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
} }
error = dbuf_read(db, NULL, error = dbuf_read(db, NULL,
DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH); DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
if (error) { if (error) {
dbuf_rele(db, FTAG); dbuf_rele(db, FTAG);
return (error); return (error);


@ -513,6 +513,7 @@ dnode_evict_dbufs(dnode_t *dn)
avl_remove(&dn->dn_dbufs, db_marker); avl_remove(&dn->dn_dbufs, db_marker);
} else { } else {
db->db_pending_evict = TRUE; db->db_pending_evict = TRUE;
db->db_partial_read = FALSE;
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
db_next = AVL_NEXT(&dn->dn_dbufs, db); db_next = AVL_NEXT(&dn->dn_dbufs, db);
} }


@ -703,8 +703,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
boolean_t dummy; boolean_t dummy;
if (hdl->sa_spill == NULL) { if (hdl->sa_spill == NULL) {
VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL, VERIFY0(dmu_spill_hold_by_bonus(hdl->sa_bonus,
&hdl->sa_spill) == 0); DB_RF_MUST_SUCCEED, NULL, &hdl->sa_spill));
} }
dmu_buf_will_dirty(hdl->sa_spill, tx); dmu_buf_will_dirty(hdl->sa_spill, tx);


@ -948,7 +948,8 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
} }
void void
spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags)
{ {
spa_history_kstat_t *shk = &spa->spa_stats.iostats; spa_history_kstat_t *shk = &spa->spa_stats.iostats;
kstat_t *ksp = shk->kstat; kstat_t *ksp = shk->kstat;
@ -967,7 +968,8 @@ spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
} }
void void
spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
dmu_flags_t flags)
{ {
spa_history_kstat_t *shk = &spa->spa_stats.iostats; spa_history_kstat_t *shk = &spa->spa_stats.iostats;
kstat_t *ksp = shk->kstat; kstat_t *ksp = shk->kstat;


@ -669,7 +669,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
int err; int err;
DB_DNODE_ENTER(db); DB_DNODE_ENTER(db);
err = dmu_read_by_dnode(DB_DNODE(db), off, len, err = dmu_read_by_dnode(DB_DNODE(db), off, len,
&lr->lr_data[0], DMU_READ_NO_PREFETCH); &lr->lr_data[0], DMU_READ_NO_PREFETCH |
DMU_KEEP_CACHING);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
if (err != 0) { if (err != 0) {
zil_itx_destroy(itx); zil_itx_destroy(itx);
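
The DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING combination used here mirrors the other ZIL copy paths; as an assumption drawn from this diff rather than a definitive statement, DMU_KEEP_CACHING leaves the dbuf's existing caching decision unchanged so a log-record copy does not re-mark data for eviction. A compressed sketch of the WR_COPIED pattern, with the itx setup elided and local names assumed:

/* Try to copy the written data straight into the log record. */
if (wr_state == WR_COPIED &&
    dmu_read_by_dnode(dn, offset, length, lr + 1,
    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) {
	/*
	 * Could not copy now; fall back to WR_NEED_COPY and let
	 * zil_commit() fetch the data later via the get_data callback.
	 */
	zil_itx_destroy(itx);
	itx = zil_itx_create(TX_WRITE, sizeof (*lr));
	wr_state = WR_NEED_COPY;
}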


@ -89,6 +89,12 @@ static int zfs_dio_enabled = 0;
static int zfs_dio_enabled = 1; static int zfs_dio_enabled = 1;
#endif #endif
/*
* Strictly enforce alignment for Direct I/O requests, returning EINVAL
* if not page-aligned instead of silently falling back to uncached I/O.
*/
static int zfs_dio_strict = 0;
/* /*
* Maximum bytes to read per chunk in zfs_read(). * Maximum bytes to read per chunk in zfs_read().
@ -243,46 +249,54 @@ zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
int ioflag = *ioflagp; int ioflag = *ioflagp;
int error = 0; int error = 0;
if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED || if (os->os_direct == ZFS_DIRECT_ALWAYS) {
zn_has_cached_data(zp, zfs_uio_offset(uio), /* Force either direct or uncached I/O. */
ioflag |= O_DIRECT;
}
if ((ioflag & O_DIRECT) == 0)
goto out;
if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED) {
/*
* Direct I/O is disabled. The I/O request will be directed
* through the ARC as uncached I/O.
*/
goto out;
}
if (!zfs_uio_page_aligned(uio) ||
!zfs_uio_aligned(uio, PAGE_SIZE)) {
/*
* Misaligned requests can be executed through the ARC as
		 * uncached I/O. But if O_DIRECT was requested by the user and
		 * strict enforcement is enabled, fail with EINVAL.
*/
if ((*ioflagp & O_DIRECT) && zfs_dio_strict)
error = SET_ERROR(EINVAL);
goto out;
}
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) { zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
/* /*
* Direct I/O is disabled or the region is mmap'ed. In either * The region is mmap'ed. The I/O request will be directed
* case the I/O request will just directed through the ARC. * through the ARC as uncached I/O.
*/ */
ioflag &= ~O_DIRECT;
goto out; goto out;
} else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
zfs_uio_page_aligned(uio) &&
zfs_uio_aligned(uio, PAGE_SIZE)) {
if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
(rw == UIO_READ)) {
ioflag |= O_DIRECT;
}
} else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
/*
* Direct I/O was requested through the direct=always, but it
* is not properly PAGE_SIZE aligned. The request will be
* directed through the ARC.
*/
ioflag &= ~O_DIRECT;
} }
if (ioflag & O_DIRECT) { /*
if (!zfs_uio_page_aligned(uio) || * For short writes the page mapping of Direct I/O makes no sense.
!zfs_uio_aligned(uio, PAGE_SIZE)) { * Direct them through the ARC as uncached I/O.
error = SET_ERROR(EINVAL); */
goto out; if (rw == UIO_WRITE && zfs_uio_resid(uio) < zp->z_blksz)
} goto out;
error = zfs_uio_get_dio_pages_alloc(uio, rw); error = zfs_uio_get_dio_pages_alloc(uio, rw);
if (error) { if (error)
goto out; goto out;
} ASSERT(uio->uio_extflg & UIO_DIRECT);
}
IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
ASSERT0(error);
out: out:
*ioflagp = ioflag; *ioflagp = ioflag;
@ -392,6 +406,9 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
ssize_t start_resid = n; ssize_t start_resid = n;
ssize_t dio_remaining_resid = 0; ssize_t dio_remaining_resid = 0;
dmu_flags_t dflags = DMU_READ_PREFETCH;
if (ioflag & O_DIRECT)
dflags |= DMU_UNCACHEDIO;
if (uio->uio_extflg & UIO_DIRECT) { if (uio->uio_extflg & UIO_DIRECT) {
/* /*
	 * All pages for an O_DIRECT request have already been mapped	 * All pages for an O_DIRECT request have already been mapped
@ -414,6 +431,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t); dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
if (dio_remaining_resid != 0) if (dio_remaining_resid != 0)
n -= dio_remaining_resid; n -= dio_remaining_resid;
dflags |= DMU_DIRECTIO;
} }
while (n > 0) { while (n > 0) {
@ -429,7 +447,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
error = mappedread(zp, nbytes, uio); error = mappedread(zp, nbytes, uio);
} else { } else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes); uio, nbytes, dflags);
} }
if (error) { if (error) {
@ -479,15 +497,17 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* remainder of the file can be read using the ARC. * remainder of the file can be read using the ARC.
*/ */
uio->uio_extflg &= ~UIO_DIRECT; uio->uio_extflg &= ~UIO_DIRECT;
dflags &= ~DMU_DIRECTIO;
if (zn_has_cached_data(zp, zfs_uio_offset(uio), if (zn_has_cached_data(zp, zfs_uio_offset(uio),
zfs_uio_offset(uio) + dio_remaining_resid - 1)) { zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
error = mappedread(zp, dio_remaining_resid, uio); error = mappedread(zp, dio_remaining_resid, uio);
} else { } else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
dio_remaining_resid); dio_remaining_resid, dflags);
} }
uio->uio_extflg |= UIO_DIRECT; uio->uio_extflg |= UIO_DIRECT;
dflags |= DMU_DIRECTIO;
if (error != 0) if (error != 0)
n += dio_remaining_resid; n += dio_remaining_resid;
@ -859,12 +879,18 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
zfs_rangelock_reduce(lr, woff, n); zfs_rangelock_reduce(lr, woff, n);
} }
dmu_flags_t dflags = DMU_READ_PREFETCH;
if (ioflag & O_DIRECT)
dflags |= DMU_UNCACHEDIO;
if (uio->uio_extflg & UIO_DIRECT)
dflags |= DMU_DIRECTIO;
ssize_t tx_bytes; ssize_t tx_bytes;
if (abuf == NULL) { if (abuf == NULL) {
tx_bytes = zfs_uio_resid(uio); tx_bytes = zfs_uio_resid(uio);
zfs_uio_fault_disable(uio, B_TRUE); zfs_uio_fault_disable(uio, B_TRUE);
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes, tx); uio, nbytes, tx, dflags);
zfs_uio_fault_disable(uio, B_FALSE); zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__ #ifdef __linux__
if (error == EFAULT) { if (error == EFAULT) {
@ -903,7 +929,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* arc buffer to a dbuf. * arc buffer to a dbuf.
*/ */
error = dmu_assign_arcbuf_by_dbuf( error = dmu_assign_arcbuf_by_dbuf(
sa_get_db(zp->z_sa_hdl), woff, abuf, tx); sa_get_db(zp->z_sa_hdl), woff, abuf, tx, dflags);
if (error != 0) { if (error != 0) {
/* /*
* XXX This might not be necessary if * XXX This might not be necessary if
@ -1329,7 +1355,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
error = SET_ERROR(ENOENT); error = SET_ERROR(ENOENT);
} else { } else {
error = dmu_read(os, object, offset, size, buf, error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH); DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
} }
ASSERT(error == 0 || error == ENOENT); ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */ } else { /* indirect write */
@ -2019,3 +2045,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
"Enable Direct I/O"); "Enable Direct I/O");
ZFS_MODULE_PARAM(zfs, zfs_, dio_strict, INT, ZMOD_RW,
"Return errors on misaligned Direct I/O");

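For the user-visible effect of zfs_dio_strict, a small userland example (the mountpoint path is a hypothetical assumption, and the dataset is assumed to use direct=standard): with zfs_dio_strict=0 the sub-pagesize O_DIRECT write is redirected through the ARC as uncached I/O and succeeds, while with zfs_dio_strict=1 it fails with EINVAL, matching the test changes below.

#define _GNU_SOURCE		/* for O_DIRECT on Linux */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char buf[512];		/* smaller than a page, so not page aligned */
	int fd = open("/testpool/testfs/file",
	    O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd == -1) {
		perror("open");
		return (1);
	}
	memset(buf, 0xab, sizeof (buf));

	/*
	 * With zfs_dio_strict=0 this write falls back to uncached I/O
	 * through the ARC and succeeds; with zfs_dio_strict=1 it is
	 * rejected with EINVAL.
	 */
	if (pwrite(fd, buf, sizeof (buf), 0) == -1)
		fprintf(stderr, "pwrite: %s\n", strerror(errno));
	(void) close(fd);
	return (0);
}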

@ -900,8 +900,9 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
itx = zil_itx_create(TX_WRITE, sizeof (*lr) + itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
(wr_state == WR_COPIED ? len : 0)); (wr_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr; lr = (lr_write_t *)&itx->itx_lr;
if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, if (wr_state == WR_COPIED &&
offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1,
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) {
zil_itx_destroy(itx); zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr)); itx = zil_itx_create(TX_WRITE, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr; lr = (lr_write_t *)&itx->itx_lr;
@ -994,7 +995,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
size, RL_READER); size, RL_READER);
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH); DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
} else { /* indirect write */ } else { /* indirect write */
ASSERT3P(zio, !=, NULL); ASSERT3P(zio, !=, NULL);
/* /*


@ -107,6 +107,7 @@ VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
BCLONE_ENABLED bclone_enabled zfs_bclone_enabled BCLONE_ENABLED bclone_enabled zfs_bclone_enabled
BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty
DIO_ENABLED dio_enabled zfs_dio_enabled DIO_ENABLED dio_enabled zfs_dio_enabled
DIO_STRICT dio_strict zfs_dio_strict
XATTR_COMPAT xattr_compat zfs_xattr_compat XATTR_COMPAT xattr_compat zfs_xattr_compat
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max


@ -40,8 +40,10 @@
verify_runnable "global" verify_runnable "global"
log_must save_tunable DIO_STRICT
function cleanup function cleanup
{ {
restore_tunable DIO_STRICT
zfs set recordsize=$rs $TESTPOOL/$TESTFS zfs set recordsize=$rs $TESTPOOL/$TESTFS
zfs set direct=standard $TESTPOOL/$TESTFS zfs set direct=standard $TESTPOOL/$TESTFS
log_must rm -f $tmp_file log_must rm -f $tmp_file
@ -61,6 +63,13 @@ file_size=$((rs * 8))
log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1 log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1
log_must set_tunable32 DIO_STRICT 0
log_must zfs set direct=standard $TESTPOOL/$TESTFS
# sub-pagesize direct writes/read will always pass if not strict.
log_must stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D
log_must stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 -d
log_must set_tunable32 DIO_STRICT 1
log_must zfs set direct=standard $TESTPOOL/$TESTFS log_must zfs set direct=standard $TESTPOOL/$TESTFS
# sub-pagesize direct writes/read will always fail if direct=standard. # sub-pagesize direct writes/read will always fail if direct=standard.
log_mustnot stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D log_mustnot stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D


@ -48,6 +48,7 @@ TESTDS=${TESTPOOL}/${TESTFS}
TESTFILE=${TESTDIR}/${TESTFILE0} TESTFILE=${TESTDIR}/${TESTFILE0}
log_must save_tunable DIO_ENABLED log_must save_tunable DIO_ENABLED
log_must save_tunable DIO_STRICT
typeset recordsize_saved=$(get_prop recordsize $TESTDS) typeset recordsize_saved=$(get_prop recordsize $TESTDS)
typeset direct_saved=$(get_prop direct $TESTDS) typeset direct_saved=$(get_prop direct $TESTDS)
@ -57,6 +58,7 @@ function cleanup
zfs set recordsize=$recordsize_saved $TESTDS zfs set recordsize=$recordsize_saved $TESTDS
zfs set direct=$direct_saved $TESTDS zfs set direct=$direct_saved $TESTDS
restore_tunable DIO_ENABLED restore_tunable DIO_ENABLED
restore_tunable DIO_STRICT
} }
log_onexit cleanup log_onexit cleanup
@ -154,6 +156,7 @@ for krs in 4 8 16 32 64 128 256 512 ; do
done done
# reset for write tests # reset for write tests
log_must set_tunable32 DIO_STRICT 1
log_must zfs set recordsize=16K $TESTDS log_must zfs set recordsize=16K $TESTDS
log_must zfs set direct=standard $TESTDS log_must zfs set direct=standard $TESTDS
@ -173,4 +176,12 @@ log_must zpool sync
assert_dioalign $TESTFILE $PAGE_SIZE 16384 assert_dioalign $TESTFILE $PAGE_SIZE 16384
log_mustnot dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct log_mustnot dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct
# same again, but without strict, which should succeed.
log_must set_tunable32 DIO_STRICT 0
log_must rm -f $TESTFILE
log_must touch $TESTFILE
log_must zpool sync
assert_dioalign $TESTFILE $PAGE_SIZE 16384
log_must dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct
log_pass $CLAIM log_pass $CLAIM