mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Wire O_DIRECT also to Uncached I/O (#17218)
Before Direct I/O was implemented, I had implemented a lighter version that I called Uncached I/O. It uses the normal DMU/ARC data path with some optimizations, but evicts data from the caches as soon as possible and reasonable. Originally I wired it only to the primarycache property, but I am now completing the integration all the way up to the VFS. While Direct I/O has the lowest possible memory bandwidth usage, it also has a significant number of limitations: it requires I/Os to be page-aligned, does not allow speculative prefetch, etc. Uncached I/O does not have those limitations, but instead requires an additional memory copy, though still one fewer than regular cached I/O. As such it should fill the gap in between. Considering this, I've disabled the annoying EINVAL errors on misaligned requests, adding a tunable for those who want to test their applications. To pass the information between the layers I had to change a number of APIs. As a side effect, upper layers can now control not only the caching but also speculative prefetch. I haven't wired the latter to the VFS yet, since it requires looking into some OS specifics. But while there, I've implemented speculative prefetch of indirect blocks for Direct I/O, controllable via all the same mechanisms. Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Fixes #17027 Reviewed-by: Rob Norris <robn@despairlabs.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
@@ -41,7 +41,6 @@
|
||||
#include <sys/dsl_pool.h>
|
||||
#include <sys/dsl_synctask.h>
|
||||
#include <sys/dsl_prop.h>
|
||||
#include <sys/dmu_zfetch.h>
|
||||
#include <sys/zfs_ioctl.h>
|
||||
#include <sys/zap.h>
|
||||
#include <sys/zio_checksum.h>
|
||||
@@ -71,6 +70,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
struct sf_buf *sf;
|
||||
int numbufs, i;
|
||||
int err;
|
||||
dmu_flags_t flags = 0;
|
||||
|
||||
if (size == 0)
|
||||
return (0);
|
||||
@@ -94,10 +94,17 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
|
||||
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
|
||||
|
||||
if (tocpy == db->db_size)
|
||||
if (tocpy == db->db_size) {
|
||||
dmu_buf_will_fill(db, tx, B_FALSE);
|
||||
else
|
||||
dmu_buf_will_dirty(db, tx);
|
||||
} else {
|
||||
if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
|
||||
if (bufoff == 0)
|
||||
flags |= DMU_PARTIAL_FIRST;
|
||||
else
|
||||
flags |= DMU_PARTIAL_MORE;
|
||||
}
|
||||
dmu_buf_will_dirty_flags(db, tx, flags);
|
||||
}
|
||||
|
||||
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
|
||||
ASSERT3U(ptoa((*ma)->pindex), ==,
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
#include <sys/racct.h>
|
||||
|
||||
void
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
curthread->td_ru.ru_inblock += iops;
|
||||
#ifdef RACCT
|
||||
@@ -46,7 +46,7 @@ zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
}
|
||||
|
||||
void
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
curthread->td_ru.ru_oublock += iops;
|
||||
#ifdef RACCT
|
||||
|
||||
@@ -530,7 +530,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
|
||||
page_unhold(pp);
|
||||
} else {
|
||||
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
||||
uio, bytes);
|
||||
uio, bytes, DMU_READ_PREFETCH);
|
||||
}
|
||||
len -= bytes;
|
||||
off = 0;
|
||||
|
||||
@@ -679,7 +679,7 @@ zvol_strategy_impl(zv_request_t *zvr)
|
||||
while (resid != 0 && off < volsize) {
|
||||
size_t size = MIN(resid, zvol_maxphys);
|
||||
if (doread) {
|
||||
error = dmu_read(os, ZVOL_OBJ, off, size, addr,
|
||||
error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
|
||||
DMU_READ_PREFETCH);
|
||||
} else {
|
||||
dmu_tx_t *tx = dmu_tx_create(os);
|
||||
@@ -688,7 +688,8 @@ zvol_strategy_impl(zv_request_t *zvr)
|
||||
if (error) {
|
||||
dmu_tx_abort(tx);
|
||||
} else {
|
||||
dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
|
||||
dmu_write_by_dnode(zv->zv_dn, off, size, addr,
|
||||
tx, DMU_READ_PREFETCH);
|
||||
zvol_log_write(zv, tx, off, size, commit);
|
||||
dmu_tx_commit(tx);
|
||||
}
|
||||
@@ -834,7 +835,8 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
|
||||
if (bytes > volsize - zfs_uio_offset(&uio))
|
||||
bytes = volsize - zfs_uio_offset(&uio);
|
||||
|
||||
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
|
||||
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
|
||||
DMU_READ_PREFETCH);
|
||||
if (error) {
|
||||
/* Convert checksum errors into IO errors. */
|
||||
if (error == ECKSUM)
|
||||
@@ -893,7 +895,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
|
||||
dmu_tx_abort(tx);
|
||||
break;
|
||||
}
|
||||
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
|
||||
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
|
||||
DMU_READ_PREFETCH);
|
||||
if (error == 0)
|
||||
zvol_log_write(zv, tx, off, bytes, commit);
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
@@ -30,14 +30,14 @@
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
|
||||
void
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
task_io_account_read(size);
|
||||
spa_iostats_read_add(spa, size, iops, flags);
|
||||
}
|
||||
|
||||
void
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
task_io_account_write(size);
|
||||
spa_iostats_write_add(spa, size, iops, flags);
|
||||
@@ -46,13 +46,13 @@ zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
#else
|
||||
|
||||
void
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
(void) spa, (void) size, (void) iops, (void) flags;
|
||||
}
|
||||
|
||||
void
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, dmu_flags_t flags)
|
||||
{
|
||||
(void) spa, (void) size, (void) iops, (void) flags;
|
||||
}
|
||||
|
||||
@@ -329,7 +329,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
|
||||
put_page(pp);
|
||||
} else {
|
||||
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
||||
uio, bytes);
|
||||
uio, bytes, DMU_READ_PREFETCH);
|
||||
}
|
||||
|
||||
len -= bytes;
|
||||
|
||||
@@ -258,7 +258,8 @@ zvol_write(zv_request_t *zvr)
|
||||
dmu_tx_abort(tx);
|
||||
break;
|
||||
}
|
||||
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
|
||||
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
|
||||
DMU_READ_PREFETCH);
|
||||
if (error == 0) {
|
||||
zvol_log_write(zv, tx, off, bytes, sync);
|
||||
}
|
||||
@@ -428,7 +429,8 @@ zvol_read(zv_request_t *zvr)
|
||||
if (bytes > volsize - uio.uio_loffset)
|
||||
bytes = volsize - uio.uio_loffset;
|
||||
|
||||
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
|
||||
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
|
||||
DMU_READ_PREFETCH);
|
||||
if (error) {
|
||||
/* convert checksum errors into IO errors */
|
||||
if (error == ECKSUM)
|
||||
|
||||
+3
-1
@@ -6103,7 +6103,9 @@ top:
|
||||
ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
|
||||
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
|
||||
metadata, misses);
|
||||
zfs_racct_read(spa, size, 1, 0);
|
||||
zfs_racct_read(spa, size, 1,
|
||||
(*arc_flags & ARC_FLAG_UNCACHED) ?
|
||||
DMU_UNCACHEDIO : 0);
|
||||
}
|
||||
|
||||
/* Check if the spa even has l2 configured */
|
||||
|
||||
+55
-33
@@ -1499,7 +1499,8 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
|
||||
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
|
||||
*/
|
||||
static int
|
||||
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
objset_t *os = db->db_objset;
|
||||
dmu_buf_impl_t *dndb;
|
||||
@@ -1507,7 +1508,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
zbookmark_phys_t zb;
|
||||
int err;
|
||||
|
||||
if ((flags & DB_RF_NO_DECRYPT) != 0 ||
|
||||
if ((flags & DMU_READ_NO_DECRYPT) != 0 ||
|
||||
!os->os_encrypted || os->os_raw_receive ||
|
||||
(dndb = dn->dn_dbuf) == NULL)
|
||||
return (0);
|
||||
@@ -1561,7 +1562,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
* returning.
|
||||
*/
|
||||
static int
|
||||
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
|
||||
db_lock_type_t dblt, blkptr_t *bp, const void *tag)
|
||||
{
|
||||
zbookmark_phys_t zb;
|
||||
@@ -1627,7 +1628,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||
zio_flags = (flags & DB_RF_CANFAIL) ?
|
||||
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
|
||||
|
||||
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
|
||||
if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp))
|
||||
zio_flags |= ZIO_FLAG_RAW;
|
||||
|
||||
/*
|
||||
@@ -1728,7 +1729,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
|
||||
}
|
||||
|
||||
int
|
||||
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags)
|
||||
{
|
||||
dnode_t *dn;
|
||||
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
|
||||
@@ -1748,12 +1749,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
goto done;
|
||||
|
||||
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
||||
(flags & DB_RF_NOPREFETCH) == 0;
|
||||
(flags & DMU_READ_NO_PREFETCH) == 0;
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
if (flags & DB_RF_PARTIAL_FIRST)
|
||||
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
|
||||
db->db_pending_evict = B_FALSE;
|
||||
if (flags & DMU_PARTIAL_FIRST)
|
||||
db->db_partial_read = B_TRUE;
|
||||
else if (!(flags & DB_RF_PARTIAL_MORE))
|
||||
else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING)))
|
||||
db->db_partial_read = B_FALSE;
|
||||
miss = (db->db_state != DB_CACHED);
|
||||
|
||||
@@ -1794,7 +1797,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
* unauthenticated blocks, which will verify their MAC if
|
||||
* the key is now available.
|
||||
*/
|
||||
if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
|
||||
if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL &&
|
||||
(arc_is_encrypted(db->db_buf) ||
|
||||
arc_is_unauthenticated(db->db_buf) ||
|
||||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
|
||||
@@ -1842,7 +1845,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
|
||||
if (err == 0 && prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
|
||||
flags & DB_RF_HAVESTRUCT);
|
||||
flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) ||
|
||||
db->db_pending_evict);
|
||||
}
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
@@ -1874,11 +1878,14 @@ done:
|
||||
}
|
||||
|
||||
static void
|
||||
dbuf_noread(dmu_buf_impl_t *db)
|
||||
dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)
|
||||
{
|
||||
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
||||
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
||||
mutex_enter(&db->db_mtx);
|
||||
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
|
||||
db->db_pending_evict = B_FALSE;
|
||||
db->db_partial_read = B_FALSE;
|
||||
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
||||
cv_wait(&db->db_changed, &db->db_mtx);
|
||||
if (db->db_state == DB_UNCACHED) {
|
||||
@@ -2191,8 +2198,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
|
||||
kmem_free(dr, sizeof (*dr));
|
||||
return (NULL);
|
||||
}
|
||||
int err = dbuf_read(parent_db, NULL,
|
||||
(DB_RF_NOPREFETCH | DB_RF_CANFAIL));
|
||||
int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL |
|
||||
DMU_READ_NO_PREFETCH);
|
||||
if (err != 0) {
|
||||
dbuf_rele(parent_db, FTAG);
|
||||
kmem_free(dr, sizeof (*dr));
|
||||
@@ -2620,8 +2627,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
static void
|
||||
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
|
||||
void
|
||||
dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
||||
boolean_t undirty = B_FALSE;
|
||||
@@ -2673,7 +2680,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
|
||||
* not the underlying block that is being replaced. dbuf_undirty() will
|
||||
* do brt_pending_remove() before removing the dirty record.
|
||||
*/
|
||||
(void) dbuf_read(db, NULL, flags);
|
||||
(void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED);
|
||||
if (undirty) {
|
||||
mutex_enter(&db->db_mtx);
|
||||
VERIFY(!dbuf_undirty(db, tx));
|
||||
@@ -2685,8 +2692,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
|
||||
void
|
||||
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
{
|
||||
dmu_buf_will_dirty_impl(db_fake,
|
||||
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
|
||||
dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
@@ -2850,7 +2856,7 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
DBUF_VERIFY(db);
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dbuf_noread(db);
|
||||
dbuf_noread(db, DMU_KEEP_CACHING);
|
||||
(void) dbuf_dirty(db, tx);
|
||||
}
|
||||
|
||||
@@ -2864,12 +2870,13 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
DTRACE_SET_STATE(db, "allocating NOFILL buffer");
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dbuf_noread(db);
|
||||
dbuf_noread(db, DMU_KEEP_CACHING);
|
||||
(void) dbuf_dirty(db, tx);
|
||||
}
|
||||
|
||||
void
|
||||
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
||||
dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
||||
|
||||
@@ -2891,7 +2898,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
||||
*/
|
||||
if (canfail && dr) {
|
||||
mutex_exit(&db->db_mtx);
|
||||
dmu_buf_will_dirty(db_fake, tx);
|
||||
dmu_buf_will_dirty_flags(db_fake, tx, flags);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
@@ -2907,10 +2914,16 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
||||
}
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dbuf_noread(db);
|
||||
dbuf_noread(db, flags);
|
||||
(void) dbuf_dirty(db, tx);
|
||||
}
|
||||
|
||||
void
|
||||
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
||||
{
|
||||
dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is effectively the same as dmu_buf_will_dirty(), but
|
||||
* indicates the caller expects raw encrypted data in the db, and provides
|
||||
@@ -2933,8 +2946,8 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
|
||||
ASSERT0(db->db_level);
|
||||
ASSERT(db->db_objset->os_raw_receive);
|
||||
|
||||
dmu_buf_will_dirty_impl(db_fake,
|
||||
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
|
||||
dmu_buf_will_dirty_flags(db_fake, tx,
|
||||
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
|
||||
|
||||
dr = dbuf_find_dirty_eq(db, tx->tx_txg);
|
||||
|
||||
@@ -3076,7 +3089,8 @@ dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
|
||||
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
|
||||
*/
|
||||
void
|
||||
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
|
||||
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
||||
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
||||
@@ -3090,6 +3104,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
|
||||
ASSERT(arc_released(buf));
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING)))
|
||||
db->db_pending_evict = B_FALSE;
|
||||
db->db_partial_read = B_FALSE;
|
||||
|
||||
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
||||
cv_wait(&db->db_changed, &db->db_mtx);
|
||||
@@ -3344,8 +3361,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
|
||||
|
||||
if (err)
|
||||
return (err);
|
||||
err = dbuf_read(*parentp, NULL,
|
||||
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
|
||||
err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL |
|
||||
DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH);
|
||||
if (err) {
|
||||
dbuf_rele(*parentp, NULL);
|
||||
*parentp = NULL;
|
||||
@@ -3404,7 +3421,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
|
||||
db->db_user = NULL;
|
||||
db->db_user_immediate_evict = FALSE;
|
||||
db->db_freed_in_flight = FALSE;
|
||||
db->db_pending_evict = FALSE;
|
||||
db->db_pending_evict = TRUE;
|
||||
db->db_partial_read = FALSE;
|
||||
|
||||
if (blkid == DMU_BONUS_BLKID) {
|
||||
ASSERT3P(parent, ==, dn->dn_dbuf);
|
||||
@@ -3615,8 +3633,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
|
||||
dbuf_prefetch_fini(dpa, B_TRUE);
|
||||
return;
|
||||
}
|
||||
(void) dbuf_read(db, NULL,
|
||||
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
|
||||
(void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
|
||||
DMU_READ_NO_PREFETCH);
|
||||
dbuf_rele(db, FTAG);
|
||||
}
|
||||
|
||||
@@ -4002,6 +4020,7 @@ dbuf_create_bonus(dnode_t *dn)
|
||||
ASSERT(dn->dn_bonus == NULL);
|
||||
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
|
||||
dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
|
||||
dn->dn_bonus->db_pending_evict = FALSE;
|
||||
}
|
||||
|
||||
int
|
||||
@@ -4167,8 +4186,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
|
||||
* This dbuf has anonymous data associated with it.
|
||||
*/
|
||||
dbuf_destroy(db);
|
||||
} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
|
||||
db->db_pending_evict) {
|
||||
} else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) {
|
||||
/*
|
||||
* We don't expect more accesses to the dbuf, and it
|
||||
* is either not cacheable or was marked for eviction.
|
||||
*/
|
||||
dbuf_destroy(db);
|
||||
} else if (!multilist_link_active(&db->db_cache_link)) {
|
||||
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
|
||||
|
||||
+87
-98
@@ -222,20 +222,14 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
|
||||
|
||||
int
|
||||
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
|
||||
const void *tag, dmu_buf_t **dbp, int flags)
|
||||
const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
|
||||
{
|
||||
int err;
|
||||
int db_flags = DB_RF_CANFAIL;
|
||||
|
||||
if (flags & DMU_READ_NO_PREFETCH)
|
||||
db_flags |= DB_RF_NOPREFETCH;
|
||||
if (flags & DMU_READ_NO_DECRYPT)
|
||||
db_flags |= DB_RF_NO_DECRYPT;
|
||||
|
||||
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
|
||||
if (err == 0) {
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
|
||||
err = dbuf_read(db, NULL, db_flags);
|
||||
err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
|
||||
if (err != 0) {
|
||||
dbuf_rele(db, tag);
|
||||
*dbp = NULL;
|
||||
@@ -247,20 +241,14 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
|
||||
|
||||
int
|
||||
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
|
||||
const void *tag, dmu_buf_t **dbp, int flags)
|
||||
const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
|
||||
{
|
||||
int err;
|
||||
int db_flags = DB_RF_CANFAIL;
|
||||
|
||||
if (flags & DMU_READ_NO_PREFETCH)
|
||||
db_flags |= DB_RF_NOPREFETCH;
|
||||
if (flags & DMU_READ_NO_DECRYPT)
|
||||
db_flags |= DB_RF_NO_DECRYPT;
|
||||
|
||||
err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
|
||||
if (err == 0) {
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
|
||||
err = dbuf_read(db, NULL, db_flags);
|
||||
err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
|
||||
if (err != 0) {
|
||||
dbuf_rele(db, tag);
|
||||
*dbp = NULL;
|
||||
@@ -358,16 +346,10 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
|
||||
* Returns ENOENT, EIO, or 0.
|
||||
*/
|
||||
int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
|
||||
uint32_t flags)
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_impl_t *db;
|
||||
int error;
|
||||
uint32_t db_flags = DB_RF_MUST_SUCCEED;
|
||||
|
||||
if (flags & DMU_READ_NO_PREFETCH)
|
||||
db_flags |= DB_RF_NOPREFETCH;
|
||||
if (flags & DMU_READ_NO_DECRYPT)
|
||||
db_flags |= DB_RF_NO_DECRYPT;
|
||||
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
||||
if (dn->dn_bonus == NULL) {
|
||||
@@ -393,7 +375,7 @@ int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
|
||||
*/
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
|
||||
error = dbuf_read(db, NULL, db_flags);
|
||||
error = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
|
||||
if (error) {
|
||||
dnode_evict_bonus(dn);
|
||||
dbuf_rele(db, tag);
|
||||
@@ -431,7 +413,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp)
|
||||
* dmu_spill_hold_existing() should be used.
|
||||
*/
|
||||
int
|
||||
dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag,
|
||||
dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags, const void *tag,
|
||||
dmu_buf_t **dbp)
|
||||
{
|
||||
dmu_buf_impl_t *db = NULL;
|
||||
@@ -489,18 +471,14 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp)
|
||||
}
|
||||
|
||||
int
|
||||
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
|
||||
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags, const void *tag,
|
||||
dmu_buf_t **dbp)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
|
||||
int err;
|
||||
uint32_t db_flags = DB_RF_CANFAIL;
|
||||
|
||||
if (flags & DMU_READ_NO_DECRYPT)
|
||||
db_flags |= DB_RF_NO_DECRYPT;
|
||||
|
||||
DB_DNODE_ENTER(db);
|
||||
err = dmu_spill_hold_by_dnode(DB_DNODE(db), db_flags, tag, dbp);
|
||||
err = dmu_spill_hold_by_dnode(DB_DNODE(db), flags, tag, dbp);
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
return (err);
|
||||
@@ -515,12 +493,12 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
|
||||
int
|
||||
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp,
|
||||
uint32_t flags)
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_t **dbp;
|
||||
zstream_t *zs = NULL;
|
||||
uint64_t blkid, nblks, i;
|
||||
uint32_t dbuf_flags;
|
||||
dmu_flags_t dbuf_flags;
|
||||
int err;
|
||||
zio_t *zio = NULL;
|
||||
boolean_t missed = B_FALSE;
|
||||
@@ -532,11 +510,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
* we can tell it about the multi-block read. dbuf_read() only knows
|
||||
* about the one block it is accessing.
|
||||
*/
|
||||
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
|
||||
DB_RF_NOPREFETCH;
|
||||
|
||||
if ((flags & DMU_READ_NO_DECRYPT) != 0)
|
||||
dbuf_flags |= DB_RF_NO_DECRYPT;
|
||||
dbuf_flags = (flags & ~DMU_READ_PREFETCH) | DMU_READ_NO_PREFETCH |
|
||||
DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
|
||||
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
||||
if (dn->dn_datablkshift) {
|
||||
@@ -569,15 +544,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
* that if multiple threads block on same indirect block, we
|
||||
* base predictions on the original less racy request order.
|
||||
*/
|
||||
zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read,
|
||||
B_TRUE);
|
||||
zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
|
||||
read && !(flags & DMU_DIRECTIO), B_TRUE);
|
||||
}
|
||||
for (i = 0; i < nblks; i++) {
|
||||
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
|
||||
if (db == NULL) {
|
||||
if (zs) {
|
||||
dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
|
||||
B_TRUE);
|
||||
B_TRUE, (flags & DMU_UNCACHEDIO));
|
||||
}
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
dmu_buf_rele_array(dbp, nblks, tag);
|
||||
@@ -599,9 +574,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
offset + length < db->db.db_offset +
|
||||
db->db.db_size) {
|
||||
if (offset <= db->db.db_offset)
|
||||
dbuf_flags |= DB_RF_PARTIAL_FIRST;
|
||||
dbuf_flags |= DMU_PARTIAL_FIRST;
|
||||
else
|
||||
dbuf_flags |= DB_RF_PARTIAL_MORE;
|
||||
dbuf_flags |= DMU_PARTIAL_MORE;
|
||||
}
|
||||
(void) dbuf_read(db, zio, dbuf_flags);
|
||||
if (db->db_state != DB_CACHED)
|
||||
@@ -621,8 +596,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
if (!read && ((flags & DMU_DIRECTIO) == 0))
|
||||
zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
|
||||
|
||||
if (zs)
|
||||
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
|
||||
if (zs) {
|
||||
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE,
|
||||
(flags & DMU_UNCACHEDIO));
|
||||
}
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
|
||||
if (read) {
|
||||
@@ -1170,7 +1147,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
|
||||
|
||||
static int
|
||||
dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
void *buf, uint32_t flags)
|
||||
void *buf, dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_t **dbp;
|
||||
int numbufs, err = 0;
|
||||
@@ -1198,6 +1175,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
abd_free(data);
|
||||
return (err);
|
||||
}
|
||||
flags &= ~DMU_DIRECTIO;
|
||||
|
||||
while (size > 0) {
|
||||
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
|
||||
@@ -1236,7 +1214,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
|
||||
int
|
||||
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
void *buf, uint32_t flags)
|
||||
void *buf, dmu_flags_t flags)
|
||||
{
|
||||
dnode_t *dn;
|
||||
int err;
|
||||
@@ -1252,14 +1230,14 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
|
||||
int
|
||||
dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
|
||||
uint32_t flags)
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
return (dmu_read_impl(dn, offset, size, buf, flags));
|
||||
}
|
||||
|
||||
static void
|
||||
dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx)
|
||||
const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
|
||||
{
|
||||
int i;
|
||||
|
||||
@@ -1275,10 +1253,17 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
|
||||
|
||||
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
|
||||
|
||||
if (tocpy == db->db_size)
|
||||
dmu_buf_will_fill(db, tx, B_FALSE);
|
||||
else
|
||||
dmu_buf_will_dirty(db, tx);
|
||||
if (tocpy == db->db_size) {
|
||||
dmu_buf_will_fill_flags(db, tx, B_FALSE, flags);
|
||||
} else {
|
||||
if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
|
||||
if (bufoff == 0)
|
||||
flags |= DMU_PARTIAL_FIRST;
|
||||
else
|
||||
flags |= DMU_PARTIAL_MORE;
|
||||
}
|
||||
dmu_buf_will_dirty_flags(db, tx, flags);
|
||||
}
|
||||
|
||||
ASSERT(db->db_data != NULL);
|
||||
(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
|
||||
@@ -1304,17 +1289,13 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
|
||||
VERIFY0(dmu_buf_hold_array(os, object, offset, size,
|
||||
FALSE, FTAG, &numbufs, &dbp));
|
||||
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
|
||||
dmu_write_impl(dbp, numbufs, offset, size, buf, tx, DMU_READ_PREFETCH);
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
}
|
||||
|
||||
/*
|
||||
* This interface is not used internally by ZFS but is provided for
|
||||
* use by Lustre which is built on the DMU interfaces.
|
||||
*/
|
||||
int
|
||||
dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx, uint32_t flags)
|
||||
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_t **dbp;
|
||||
int numbufs;
|
||||
@@ -1327,25 +1308,19 @@ dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
|
||||
zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
|
||||
abd_t *data = abd_get_from_buf((void *)buf, size);
|
||||
error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
|
||||
error = dmu_write_abd(dn, offset, size, data, flags, tx);
|
||||
abd_free(data);
|
||||
return (error);
|
||||
}
|
||||
flags &= ~DMU_DIRECTIO;
|
||||
|
||||
VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
|
||||
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
|
||||
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
|
||||
FALSE, FTAG, &numbufs, &dbp, flags));
|
||||
dmu_write_impl(dbp, numbufs, offset, size, buf, tx, flags);
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx)
|
||||
{
|
||||
return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0));
|
||||
}
|
||||
|
||||
void
|
||||
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
dmu_tx_t *tx)
|
||||
@@ -1402,20 +1377,22 @@ dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
|
||||
#ifdef _KERNEL
|
||||
int
|
||||
dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
|
||||
dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_t **dbp;
|
||||
int numbufs, i, err;
|
||||
|
||||
if (uio->uio_extflg & UIO_DIRECT)
|
||||
return (dmu_read_uio_direct(dn, uio, size));
|
||||
return (dmu_read_uio_direct(dn, uio, size, flags));
|
||||
flags &= ~DMU_DIRECTIO;
|
||||
|
||||
/*
|
||||
* NB: we could do this block-at-a-time, but it's nice
|
||||
* to be reading in parallel.
|
||||
*/
|
||||
err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
|
||||
TRUE, FTAG, &numbufs, &dbp, 0);
|
||||
TRUE, FTAG, &numbufs, &dbp, flags);
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
@@ -1453,7 +1430,8 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
|
||||
* because we don't have to find the dnode_t for the object.
|
||||
*/
|
||||
int
|
||||
dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
|
||||
dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
|
||||
int err;
|
||||
@@ -1462,7 +1440,7 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
|
||||
return (0);
|
||||
|
||||
DB_DNODE_ENTER(db);
|
||||
err = dmu_read_uio_dnode(DB_DNODE(db), uio, size);
|
||||
err = dmu_read_uio_dnode(DB_DNODE(db), uio, size, flags);
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
return (err);
|
||||
@@ -1474,7 +1452,8 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
|
||||
* Starting at offset zfs_uio_offset(uio).
|
||||
*/
|
||||
int
|
||||
dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
|
||||
dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
dnode_t *dn;
|
||||
int err;
|
||||
@@ -1486,7 +1465,7 @@ dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
err = dmu_read_uio_dnode(dn, uio, size);
|
||||
err = dmu_read_uio_dnode(dn, uio, size, flags);
|
||||
|
||||
dnode_rele(dn, FTAG);
|
||||
|
||||
@@ -1494,12 +1473,14 @@ dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
|
||||
}
|
||||
|
||||
int
|
||||
dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
|
||||
dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_t **dbp;
|
||||
int numbufs;
|
||||
int err = 0;
|
||||
uint64_t write_size;
|
||||
dmu_flags_t oflags = flags;
|
||||
|
||||
top:
|
||||
write_size = size;
|
||||
@@ -1512,13 +1493,14 @@ top:
|
||||
(write_size >= dn->dn_datablksz)) {
|
||||
if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
|
||||
dn->dn_datablksz)) {
|
||||
return (dmu_write_uio_direct(dn, uio, size, tx));
|
||||
return (dmu_write_uio_direct(dn, uio, size, flags, tx));
|
||||
} else if (write_size > dn->dn_datablksz &&
|
||||
zfs_dio_offset_aligned(zfs_uio_offset(uio),
|
||||
dn->dn_datablksz)) {
|
||||
write_size =
|
||||
dn->dn_datablksz * (write_size / dn->dn_datablksz);
|
||||
err = dmu_write_uio_direct(dn, uio, write_size, tx);
|
||||
err = dmu_write_uio_direct(dn, uio, write_size, flags,
|
||||
tx);
|
||||
if (err == 0) {
|
||||
size -= write_size;
|
||||
goto top;
|
||||
@@ -1530,9 +1512,10 @@ top:
|
||||
P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
|
||||
}
|
||||
}
|
||||
flags &= ~DMU_DIRECTIO;
|
||||
|
||||
err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
|
||||
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
|
||||
FALSE, FTAG, &numbufs, &dbp, flags);
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
@@ -1549,10 +1532,17 @@ top:
|
||||
|
||||
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
|
||||
|
||||
if (tocpy == db->db_size)
|
||||
dmu_buf_will_fill(db, tx, B_TRUE);
|
||||
else
|
||||
dmu_buf_will_dirty(db, tx);
|
||||
if (tocpy == db->db_size) {
|
||||
dmu_buf_will_fill_flags(db, tx, B_TRUE, flags);
|
||||
} else {
|
||||
if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
|
||||
if (bufoff == 0)
|
||||
flags |= DMU_PARTIAL_FIRST;
|
||||
else
|
||||
flags |= DMU_PARTIAL_MORE;
|
||||
}
|
||||
dmu_buf_will_dirty_flags(db, tx, flags);
|
||||
}
|
||||
|
||||
ASSERT(db->db_data != NULL);
|
||||
err = zfs_uio_fault_move((char *)db->db_data + bufoff,
|
||||
@@ -1575,6 +1565,7 @@ top:
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
|
||||
if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
|
||||
flags = oflags;
|
||||
goto top;
|
||||
}
|
||||
|
||||
@@ -1592,7 +1583,7 @@ top:
|
||||
*/
|
||||
int
|
||||
dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
|
||||
dmu_tx_t *tx)
|
||||
dmu_tx_t *tx, dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
|
||||
int err;
|
||||
@@ -1601,7 +1592,7 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
|
||||
return (0);
|
||||
|
||||
DB_DNODE_ENTER(db);
|
||||
err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx);
|
||||
err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx, flags);
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
return (err);
|
||||
@@ -1614,7 +1605,7 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
|
||||
*/
|
||||
int
|
||||
dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
|
||||
dmu_tx_t *tx)
|
||||
dmu_tx_t *tx, dmu_flags_t flags)
|
||||
{
|
||||
dnode_t *dn;
|
||||
int err;
|
||||
@@ -1626,7 +1617,7 @@ dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
err = dmu_write_uio_dnode(dn, uio, size, tx);
|
||||
err = dmu_write_uio_dnode(dn, uio, size, tx, flags);
|
||||
|
||||
dnode_rele(dn, FTAG);
|
||||
|
||||
@@ -1796,11 +1787,10 @@ dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
|
||||
*/
|
||||
int
|
||||
dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
|
||||
dmu_tx_t *tx)
|
||||
dmu_tx_t *tx, dmu_flags_t flags)
|
||||
{
|
||||
dmu_buf_impl_t *db;
|
||||
objset_t *os = dn->dn_objset;
|
||||
uint64_t object = dn->dn_object;
|
||||
uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
|
||||
uint64_t blkid;
|
||||
|
||||
@@ -1816,8 +1806,8 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
|
||||
* same size as the dbuf.
|
||||
*/
|
||||
if (offset == db->db.db_offset && blksz == db->db.db_size) {
|
||||
zfs_racct_write(os->os_spa, blksz, 1, 0);
|
||||
dbuf_assign_arcbuf(db, buf, tx);
|
||||
zfs_racct_write(os->os_spa, blksz, 1, flags);
|
||||
dbuf_assign_arcbuf(db, buf, tx, flags);
|
||||
dbuf_rele(db, FTAG);
|
||||
} else {
|
||||
/* compressed bufs must always be assignable to their dbuf */
|
||||
@@ -1825,7 +1815,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
|
||||
ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
|
||||
|
||||
dbuf_rele(db, FTAG);
|
||||
dmu_write(os, object, offset, blksz, buf->b_data, tx);
|
||||
dmu_write_by_dnode(dn, offset, blksz, buf->b_data, tx, flags);
|
||||
dmu_return_arcbuf(buf);
|
||||
}
|
||||
|
||||
@@ -1834,13 +1824,13 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
|
||||
|
||||
int
|
||||
dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
|
||||
dmu_tx_t *tx)
|
||||
dmu_tx_t *tx, dmu_flags_t flags)
|
||||
{
|
||||
int err;
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
|
||||
|
||||
DB_DNODE_ENTER(db);
|
||||
err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx);
|
||||
err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx, flags);
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
return (err);
|
||||
@@ -1985,7 +1975,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
|
||||
int error;
|
||||
|
||||
error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
|
||||
DB_RF_CANFAIL | DB_RF_NOPREFETCH);
|
||||
DB_RF_CANFAIL | DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
|
||||
if (error != 0)
|
||||
return (error);
|
||||
|
||||
@@ -2928,7 +2918,6 @@ EXPORT_SYMBOL(dmu_read_uio_dbuf);
|
||||
EXPORT_SYMBOL(dmu_read_uio_dnode);
|
||||
EXPORT_SYMBOL(dmu_write);
|
||||
EXPORT_SYMBOL(dmu_write_by_dnode);
|
||||
EXPORT_SYMBOL(dmu_write_by_dnode_flags);
|
||||
EXPORT_SYMBOL(dmu_write_uio);
|
||||
EXPORT_SYMBOL(dmu_write_uio_dbuf);
|
||||
EXPORT_SYMBOL(dmu_write_uio_dnode);
|
||||
|
||||
@@ -208,7 +208,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
|
||||
|
||||
int
|
||||
dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
abd_t *data, uint32_t flags, dmu_tx_t *tx)
|
||||
abd_t *data, dmu_flags_t flags, dmu_tx_t *tx)
|
||||
{
|
||||
dmu_buf_t **dbp;
|
||||
spa_t *spa = dn->dn_objset->os_spa;
|
||||
@@ -247,7 +247,7 @@ dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
|
||||
int
|
||||
dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
abd_t *data, uint32_t flags)
|
||||
abd_t *data, dmu_flags_t flags)
|
||||
{
|
||||
objset_t *os = dn->dn_objset;
|
||||
spa_t *spa = os->os_spa;
|
||||
@@ -351,7 +351,8 @@ error:
|
||||
|
||||
#ifdef _KERNEL
|
||||
int
|
||||
dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
|
||||
dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
offset_t offset = zfs_uio_offset(uio);
|
||||
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
|
||||
@@ -362,7 +363,7 @@ dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
|
||||
|
||||
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
|
||||
offset & (PAGESIZE - 1), size);
|
||||
err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
|
||||
err = dmu_read_abd(dn, offset, size, data, flags);
|
||||
abd_free(data);
|
||||
|
||||
if (err == 0)
|
||||
@@ -372,7 +373,8 @@ dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
|
||||
}
|
||||
|
||||
int
|
||||
dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
|
||||
dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
|
||||
dmu_flags_t flags, dmu_tx_t *tx)
|
||||
{
|
||||
offset_t offset = zfs_uio_offset(uio);
|
||||
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
|
||||
@@ -383,7 +385,7 @@ dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
|
||||
|
||||
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
|
||||
offset & (PAGESIZE - 1), size);
|
||||
err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
|
||||
err = dmu_write_abd(dn, offset, size, data, flags, tx);
|
||||
abd_free(data);
|
||||
|
||||
if (err == 0)
|
||||
|
||||
@@ -2332,12 +2332,11 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
|
||||
data = DN_BONUS(dn->dn_phys);
|
||||
}
|
||||
} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
|
||||
int rf = 0;
|
||||
dmu_flags_t rf = DB_RF_MUST_SUCCEED;
|
||||
|
||||
if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
|
||||
rf |= DB_RF_HAVESTRUCT;
|
||||
error = dmu_spill_hold_by_dnode(dn,
|
||||
rf | DB_RF_MUST_SUCCEED,
|
||||
error = dmu_spill_hold_by_dnode(dn, rf,
|
||||
FTAG, (dmu_buf_t **)&db);
|
||||
ASSERT(error == 0);
|
||||
mutex_enter(&db->db_mtx);
|
||||
|
||||
+13
-8
@@ -2135,7 +2135,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
|
||||
if (data != NULL) {
|
||||
dmu_buf_t *db;
|
||||
dnode_t *dn;
|
||||
uint32_t flags = DMU_READ_NO_PREFETCH;
|
||||
dmu_flags_t flags = DMU_READ_NO_PREFETCH;
|
||||
|
||||
if (rwa->raw)
|
||||
flags |= DMU_READ_NO_DECRYPT;
|
||||
@@ -2277,14 +2277,18 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
|
||||
dmu_write_by_dnode(dn,
|
||||
drrw->drr_offset,
|
||||
drrw->drr_logical_size,
|
||||
abd_to_buf(decomp_abd), tx);
|
||||
abd_to_buf(decomp_abd), tx,
|
||||
DMU_READ_NO_PREFETCH |
|
||||
DMU_UNCACHEDIO);
|
||||
}
|
||||
abd_free(decomp_abd);
|
||||
} else {
|
||||
dmu_write_by_dnode(dn,
|
||||
drrw->drr_offset,
|
||||
drrw->drr_logical_size,
|
||||
abd_to_buf(abd), tx);
|
||||
abd_to_buf(abd), tx,
|
||||
DMU_READ_NO_PREFETCH |
|
||||
DMU_UNCACHEDIO);
|
||||
}
|
||||
if (err == 0)
|
||||
abd_free(abd);
|
||||
@@ -2407,10 +2411,10 @@ receive_process_write_record(struct receive_writer_arg *rwa,
|
||||
if (rwa->heal) {
|
||||
blkptr_t *bp;
|
||||
dmu_buf_t *dbp;
|
||||
int flags = DB_RF_CANFAIL;
|
||||
dmu_flags_t flags = DB_RF_CANFAIL;
|
||||
|
||||
if (rwa->raw)
|
||||
flags |= DB_RF_NO_DECRYPT;
|
||||
flags |= DMU_READ_NO_DECRYPT;
|
||||
|
||||
if (rwa->byteswap) {
|
||||
dmu_object_byteswap_t byteswap =
|
||||
@@ -2567,8 +2571,8 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
|
||||
rwa->max_object = drrs->drr_object;
|
||||
|
||||
VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
|
||||
if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG,
|
||||
&db_spill)) != 0) {
|
||||
if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT |
|
||||
DB_RF_CANFAIL, FTAG, &db_spill)) != 0) {
|
||||
dmu_buf_rele(db, FTAG);
|
||||
return (err);
|
||||
}
|
||||
@@ -2621,7 +2625,8 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
|
||||
|
||||
memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs));
|
||||
abd_free(abd);
|
||||
dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
|
||||
dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx,
|
||||
DMU_UNCACHEDIO);
|
||||
|
||||
dmu_buf_rele(db, FTAG);
|
||||
dmu_buf_rele(db_spill, FTAG);
|
||||
|
||||
@@ -297,7 +297,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
}
|
||||
|
||||
if (BP_GET_LEVEL(bp) > 0) {
|
||||
uint32_t flags = ARC_FLAG_WAIT;
|
||||
arc_flags_t flags = ARC_FLAG_WAIT;
|
||||
int32_t i, ptidx, pidx;
|
||||
uint32_t prefetchlimit;
|
||||
int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
|
||||
@@ -364,8 +364,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
kmem_free(czb, sizeof (zbookmark_phys_t));
|
||||
|
||||
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
|
||||
uint32_t flags = ARC_FLAG_WAIT;
|
||||
uint32_t zio_flags = ZIO_FLAG_CANFAIL;
|
||||
arc_flags_t flags = ARC_FLAG_WAIT;
|
||||
zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
|
||||
int32_t i;
|
||||
int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
|
||||
dnode_phys_t *child_dnp;
|
||||
@@ -397,7 +397,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
break;
|
||||
}
|
||||
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
|
||||
uint32_t zio_flags = ZIO_FLAG_CANFAIL;
|
||||
zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
|
||||
arc_flags_t flags = ARC_FLAG_WAIT;
|
||||
objset_phys_t *osp;
|
||||
|
||||
@@ -669,7 +669,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
|
||||
/* See comment on ZIL traversal in dsl_scan_visitds. */
|
||||
if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
|
||||
zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
|
||||
uint32_t flags = ARC_FLAG_WAIT;
|
||||
arc_flags_t flags = ARC_FLAG_WAIT;
|
||||
objset_phys_t *osp;
|
||||
arc_buf_t *buf;
|
||||
ASSERT(!BP_IS_REDACTED(rootbp));
|
||||
|
||||
+2
-2
@@ -222,8 +222,8 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
|
||||
* PARTIAL_FIRST allows caching for uncacheable blocks. It will
|
||||
* be cleared after dmu_buf_will_dirty() calls dbuf_read() again.
|
||||
*/
|
||||
err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH |
|
||||
(level == 0 ? DB_RF_PARTIAL_FIRST : 0));
|
||||
err = dbuf_read(db, zio, DB_RF_CANFAIL | DMU_READ_NO_PREFETCH |
|
||||
(level == 0 ? (DMU_UNCACHEDIO | DMU_PARTIAL_FIRST) : 0));
|
||||
dbuf_rele(db, FTAG);
|
||||
return (err);
|
||||
}
|
||||
|
||||
@@ -690,7 +690,7 @@ prescient:
|
||||
|
||||
void
|
||||
dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
|
||||
boolean_t have_lock)
|
||||
boolean_t have_lock, boolean_t uncached)
|
||||
{
|
||||
int64_t pf_start, pf_end, ipf_start, ipf_end;
|
||||
int epbs, issued;
|
||||
@@ -745,7 +745,8 @@ dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
|
||||
issued = 0;
|
||||
for (int64_t blk = pf_start; blk < pf_end; blk++) {
|
||||
issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
|
||||
ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
|
||||
ZIO_PRIORITY_ASYNC_READ, uncached ?
|
||||
ARC_FLAG_UNCACHED : 0, dmu_zfetch_done, zs);
|
||||
}
|
||||
for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
|
||||
issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
|
||||
@@ -761,13 +762,13 @@ dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
|
||||
|
||||
void
|
||||
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
|
||||
boolean_t missed, boolean_t have_lock)
|
||||
boolean_t missed, boolean_t have_lock, boolean_t uncached)
|
||||
{
|
||||
zstream_t *zs;
|
||||
|
||||
zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
|
||||
if (zs)
|
||||
dmu_zfetch_run(zf, zs, missed, have_lock);
|
||||
dmu_zfetch_run(zf, zs, missed, have_lock, uncached);
|
||||
}
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
|
||||
|
||||
+2
-2
@@ -1510,7 +1510,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
||||
* if we get the encrypted or decrypted version.
|
||||
*/
|
||||
err = dbuf_read(db, NULL, DB_RF_CANFAIL |
|
||||
DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
|
||||
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
|
||||
if (err) {
|
||||
DNODE_STAT_BUMP(dnode_hold_dbuf_read);
|
||||
dbuf_rele(db, FTAG);
|
||||
@@ -2578,7 +2578,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
|
||||
}
|
||||
error = dbuf_read(db, NULL,
|
||||
DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
|
||||
DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
|
||||
DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
|
||||
if (error) {
|
||||
dbuf_rele(db, FTAG);
|
||||
return (error);
|
||||
|
||||
@@ -513,6 +513,7 @@ dnode_evict_dbufs(dnode_t *dn)
|
||||
avl_remove(&dn->dn_dbufs, db_marker);
|
||||
} else {
|
||||
db->db_pending_evict = TRUE;
|
||||
db->db_partial_read = FALSE;
|
||||
mutex_exit(&db->db_mtx);
|
||||
db_next = AVL_NEXT(&dn->dn_dbufs, db);
|
||||
}
|
||||
|
||||
+2
-2
@@ -703,8 +703,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
|
||||
boolean_t dummy;
|
||||
|
||||
if (hdl->sa_spill == NULL) {
|
||||
VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL,
|
||||
&hdl->sa_spill) == 0);
|
||||
VERIFY0(dmu_spill_hold_by_bonus(hdl->sa_bonus,
|
||||
DB_RF_MUST_SUCCEED, NULL, &hdl->sa_spill));
|
||||
}
|
||||
dmu_buf_will_dirty(hdl->sa_spill, tx);
|
||||
|
||||
|
||||
@@ -948,7 +948,8 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
|
||||
}
|
||||
|
||||
void
|
||||
spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
spa_history_kstat_t *shk = &spa->spa_stats.iostats;
|
||||
kstat_t *ksp = shk->kstat;
|
||||
@@ -967,7 +968,8 @@ spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
}
|
||||
|
||||
void
|
||||
spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
|
||||
dmu_flags_t flags)
|
||||
{
|
||||
spa_history_kstat_t *shk = &spa->spa_stats.iostats;
|
||||
kstat_t *ksp = shk->kstat;
|
||||
|
||||
@@ -669,7 +669,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
int err;
|
||||
DB_DNODE_ENTER(db);
|
||||
err = dmu_read_by_dnode(DB_DNODE(db), off, len,
|
||||
&lr->lr_data[0], DMU_READ_NO_PREFETCH);
|
||||
&lr->lr_data[0], DMU_READ_NO_PREFETCH |
|
||||
DMU_KEEP_CACHING);
|
||||
DB_DNODE_EXIT(db);
|
||||
if (err != 0) {
|
||||
zil_itx_destroy(itx);
|
||||
|
||||
+67
-38
@@ -89,6 +89,12 @@ static int zfs_dio_enabled = 0;
|
||||
static int zfs_dio_enabled = 1;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Strictly enforce alignment for Direct I/O requests, returning EINVAL
|
||||
* if not page-aligned instead of silently falling back to uncached I/O.
|
||||
*/
|
||||
static int zfs_dio_strict = 0;
|
||||
|
||||
|
||||
/*
|
||||
* Maximum bytes to read per chunk in zfs_read().
|
||||
@@ -243,46 +249,54 @@ zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
|
||||
int ioflag = *ioflagp;
|
||||
int error = 0;
|
||||
|
||||
if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
|
||||
zn_has_cached_data(zp, zfs_uio_offset(uio),
|
||||
if (os->os_direct == ZFS_DIRECT_ALWAYS) {
|
||||
/* Force either direct or uncached I/O. */
|
||||
ioflag |= O_DIRECT;
|
||||
}
|
||||
|
||||
if ((ioflag & O_DIRECT) == 0)
|
||||
goto out;
|
||||
|
||||
if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED) {
|
||||
/*
|
||||
* Direct I/O is disabled. The I/O request will be directed
|
||||
* through the ARC as uncached I/O.
|
||||
*/
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!zfs_uio_page_aligned(uio) ||
|
||||
!zfs_uio_aligned(uio, PAGE_SIZE)) {
|
||||
/*
|
||||
* Misaligned requests can be executed through the ARC as
|
||||
* uncached I/O. But if O_DIRECT was set by user and we
|
||||
* were set to be strict, then it is a failure.
|
||||
*/
|
||||
if ((*ioflagp & O_DIRECT) && zfs_dio_strict)
|
||||
error = SET_ERROR(EINVAL);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
|
||||
zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
|
||||
/*
|
||||
* Direct I/O is disabled or the region is mmap'ed. In either
|
||||
* case the I/O request will just directed through the ARC.
|
||||
* The region is mmap'ed. The I/O request will be directed
|
||||
* through the ARC as uncached I/O.
|
||||
*/
|
||||
ioflag &= ~O_DIRECT;
|
||||
goto out;
|
||||
} else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
|
||||
zfs_uio_page_aligned(uio) &&
|
||||
zfs_uio_aligned(uio, PAGE_SIZE)) {
|
||||
if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
|
||||
(rw == UIO_READ)) {
|
||||
ioflag |= O_DIRECT;
|
||||
}
|
||||
} else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
|
||||
/*
|
||||
* Direct I/O was requested through the direct=always, but it
|
||||
* is not properly PAGE_SIZE aligned. The request will be
|
||||
* directed through the ARC.
|
||||
*/
|
||||
ioflag &= ~O_DIRECT;
|
||||
}
|
||||
|
||||
if (ioflag & O_DIRECT) {
|
||||
if (!zfs_uio_page_aligned(uio) ||
|
||||
!zfs_uio_aligned(uio, PAGE_SIZE)) {
|
||||
error = SET_ERROR(EINVAL);
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* For short writes the page mapping of Direct I/O makes no sense.
|
||||
* Direct them through the ARC as uncached I/O.
|
||||
*/
|
||||
if (rw == UIO_WRITE && zfs_uio_resid(uio) < zp->z_blksz)
|
||||
goto out;
|
||||
|
||||
error = zfs_uio_get_dio_pages_alloc(uio, rw);
|
||||
if (error) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
|
||||
ASSERT0(error);
|
||||
error = zfs_uio_get_dio_pages_alloc(uio, rw);
|
||||
if (error)
|
||||
goto out;
|
||||
ASSERT(uio->uio_extflg & UIO_DIRECT);
|
||||
|
||||
out:
|
||||
*ioflagp = ioflag;
|
||||
@@ -392,6 +406,9 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
||||
ssize_t start_resid = n;
|
||||
ssize_t dio_remaining_resid = 0;
|
||||
|
||||
dmu_flags_t dflags = DMU_READ_PREFETCH;
|
||||
if (ioflag & O_DIRECT)
|
||||
dflags |= DMU_UNCACHEDIO;
|
||||
if (uio->uio_extflg & UIO_DIRECT) {
|
||||
/*
|
||||
* All pages for an O_DIRECT request have already been mapped
|
||||
@@ -414,6 +431,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
||||
dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
|
||||
if (dio_remaining_resid != 0)
|
||||
n -= dio_remaining_resid;
|
||||
dflags |= DMU_DIRECTIO;
|
||||
}
|
||||
|
||||
while (n > 0) {
|
||||
@@ -429,7 +447,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
||||
error = mappedread(zp, nbytes, uio);
|
||||
} else {
|
||||
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
||||
uio, nbytes);
|
||||
uio, nbytes, dflags);
|
||||
}
|
||||
|
||||
if (error) {
|
||||
@@ -479,15 +497,17 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
||||
* remainder of the file can be read using the ARC.
|
||||
*/
|
||||
uio->uio_extflg &= ~UIO_DIRECT;
|
||||
dflags &= ~DMU_DIRECTIO;
|
||||
|
||||
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
|
||||
zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
|
||||
error = mappedread(zp, dio_remaining_resid, uio);
|
||||
} else {
|
||||
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
|
||||
dio_remaining_resid);
|
||||
dio_remaining_resid, dflags);
|
||||
}
|
||||
uio->uio_extflg |= UIO_DIRECT;
|
||||
dflags |= DMU_DIRECTIO;
|
||||
|
||||
if (error != 0)
|
||||
n += dio_remaining_resid;
|
||||
@@ -859,12 +879,18 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
||||
zfs_rangelock_reduce(lr, woff, n);
|
||||
}
|
||||
|
||||
dmu_flags_t dflags = DMU_READ_PREFETCH;
|
||||
if (ioflag & O_DIRECT)
|
||||
dflags |= DMU_UNCACHEDIO;
|
||||
if (uio->uio_extflg & UIO_DIRECT)
|
||||
dflags |= DMU_DIRECTIO;
|
||||
|
||||
ssize_t tx_bytes;
|
||||
if (abuf == NULL) {
|
||||
tx_bytes = zfs_uio_resid(uio);
|
||||
zfs_uio_fault_disable(uio, B_TRUE);
|
||||
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
||||
uio, nbytes, tx);
|
||||
uio, nbytes, tx, dflags);
|
||||
zfs_uio_fault_disable(uio, B_FALSE);
|
||||
#ifdef __linux__
|
||||
if (error == EFAULT) {
|
||||
@@ -903,7 +929,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
||||
* arc buffer to a dbuf.
|
||||
*/
|
||||
error = dmu_assign_arcbuf_by_dbuf(
|
||||
sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
|
||||
sa_get_db(zp->z_sa_hdl), woff, abuf, tx, dflags);
|
||||
if (error != 0) {
|
||||
/*
|
||||
* XXX This might not be necessary if
|
||||
@@ -1329,7 +1355,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
|
||||
error = SET_ERROR(ENOENT);
|
||||
} else {
|
||||
error = dmu_read(os, object, offset, size, buf,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
|
||||
}
|
||||
ASSERT(error == 0 || error == ENOENT);
|
||||
} else { /* indirect write */
|
||||
@@ -2019,3 +2045,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
|
||||
"Enable Direct I/O");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, dio_strict, INT, ZMOD_RW,
|
||||
"Return errors on misaligned Direct I/O");
|
||||
|
||||
+4
-3
@@ -900,8 +900,9 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
|
||||
itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
|
||||
(wr_state == WR_COPIED ? len : 0));
|
||||
lr = (lr_write_t *)&itx->itx_lr;
|
||||
if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
|
||||
offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
|
||||
if (wr_state == WR_COPIED &&
|
||||
dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1,
|
||||
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) {
|
||||
zil_itx_destroy(itx);
|
||||
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
|
||||
lr = (lr_write_t *)&itx->itx_lr;
|
||||
@@ -994,7 +995,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
|
||||
size, RL_READER);
|
||||
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
|
||||
} else { /* indirect write */
|
||||
ASSERT3P(zio, !=, NULL);
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user