From e0ef4d2768dd031c524de1beb3e20a4c1a988f09 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 11 Jun 2025 14:59:16 -0400 Subject: [PATCH] Improve block cloning transactions accounting Previous dmu_tx_count_clone() was broken, stating that cloning is similar to free. While they might be from some points, cloning is not net-free. It will likely consume space and memory, and unlike free it will do it no matter whether the destination has the blocks or not (usually not, so previous code did nothing). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #17431 --- include/sys/dmu.h | 2 +- module/zfs/dbuf.c | 3 ++- module/zfs/dmu_tx.c | 43 +++++++++++++++++++++++++++++++++++------- module/zfs/zfs_vnops.c | 5 +++-- module/zfs/zvol.c | 5 +++-- 5 files changed, 45 insertions(+), 13 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index fec7714fd..0b2e443a4 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -855,7 +855,7 @@ void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, - int len); + uint64_t len, uint_t blksz); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index fd0909531..f1b5a17f3 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3901,7 +3901,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT3U(dn->dn_nlevels, >, level); + if (!fail_sparse) + ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index c81529a44..b5e9c7e6e 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -547,17 +548,45 @@ dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) } static void -dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len, + uint_t blksz) { + dmu_tx_t *tx = txh->txh_tx; + dnode_t *dn = txh->txh_dnode; + int err; - /* - * Reuse dmu_tx_count_free(), it does exactly what we need for clone. - */ - dmu_tx_count_free(txh, off, len); + ASSERT0(tx->tx_txg); + ASSERT(dn->dn_indblkshift != 0); + ASSERT(blksz != 0); + ASSERT0(off % blksz); + + (void) zfs_refcount_add_many(&txh->txh_memory_tohold, + len / blksz * sizeof (brt_entry_t), FTAG); + + int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t start = off / blksz >> shift; + uint64_t end = (off + len) / blksz >> shift; + + (void) zfs_refcount_add_many(&txh->txh_space_towrite, + (end - start + 1) << dn->dn_indblkshift, FTAG); + + zio_t *zio = zio_root(tx->tx_pool->dp_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + for (uint64_t i = start; i <= end; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err != 0) { + tx->tx_err = err; + break; + } + } + err = zio_wait(zio); + if (err != 0) + tx->tx_err = err; } void -dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + uint64_t len, uint_t blksz) { dmu_tx_hold_t *txh; @@ -567,7 +596,7 @@ dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len); if (txh != NULL) { dmu_tx_count_dnode(txh); - dmu_tx_count_clone(txh, off, len); + dmu_tx_count_clone(txh, off, len, blksz); } } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 9aba525d7..656ca4dc2 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1836,7 +1836,8 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); DB_DNODE_ENTER(db); - dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size); + dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size, + inblksz); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, outzp); error = dmu_tx_assign(tx, DMU_TX_WAIT); @@ -2003,7 +2004,7 @@ zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); - dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len); + dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len, blksz); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, DMU_TX_WAIT); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index db1f103c0..3568d4f43 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -577,7 +577,7 @@ zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) if (error != 0 || !zv->zv_dn) return (error); tx = dmu_tx_create(os); - dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len); + dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len, blksz); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx); @@ -742,7 +742,8 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, } tx = dmu_tx_create(zv_dst->zv_objset); - dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size); + dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size, + zv_src->zv_volblocksize); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx);