Implement physical rewrites

Based on the previous commit, this implements the `zfs rewrite -P` flag,
making ZFS keep the blocks' logical birth times while rewriting
files.  This should exclude the rewritten blocks from incremental
sends, snapshot diffs, etc.  At the same time, snapshot space usage
will reflect the additional space used by the newly allocated blocks.

Since this begins to use the new "rewrite" flag in the block pointers,
this commit introduces a new read-compatible per-dataset feature,
physical_rewrite.  It must be enabled, or the command will fail;
it is activated on first use and deactivated on deletion of the
last affected dataset.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:  Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17565
This commit is contained in:
Alexander Motin
2025-07-23 15:51:00 -04:00
committed by Brian Behlendorf
parent 4ae8bf406b
commit 60f714e6e2
19 changed files with 270 additions and 18 deletions
+12
View File
@@ -798,6 +798,18 @@ zpool_feature_init(void)
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE,
ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
{
/*
 * Register the physical_rewrite feature (com.truenas:physical_rewrite).
 * It is a boolean feature flagged as read-only compatible and
 * per-dataset, and it lists extensible_dataset as its only dependency
 * (the dependency array is terminated by SPA_FEATURE_NONE).
 */
static const spa_feature_t physical_rewrite_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_PHYSICAL_REWRITE,
"com.truenas:physical_rewrite", "physical_rewrite",
"Support for preserving logical birth time during rewrite.",
ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
ZFEATURE_TYPE_BOOLEAN, physical_rewrite_deps, sfeatures);
}
zfs_mod_list_supported_free(sfeatures);
}
+57
View File
@@ -2160,6 +2160,12 @@ dbuf_redirty(dbuf_dirty_record_t *dr)
ASSERT(arc_released(db->db_buf));
arc_buf_thaw(db->db_buf);
}
/*
* Clear the rewrite flag since this is now a logical
* modification.
*/
dr->dt.dl.dr_rewrite = B_FALSE;
}
}
@@ -2707,6 +2713,38 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
}
/*
 * Dirty a dbuf on behalf of a rewrite in the given transaction.
 *
 * db_fake - the held dbuf to be rewritten
 * tx      - an assigned transaction (tx_txg must be non-zero)
 *
 * When the dbuf already has a dirty record for tx's txg, this is a no-op.
 * Otherwise the dbuf is dirtied through dmu_buf_will_dirty_flags() with
 * DMU_READ_NO_PREFETCH and, for level-0 dbufs only, the resulting dirty
 * record gets dr_rewrite set so the write preserves the logical birth time.
 */
void
dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dbuf_dirty_record_t *dr;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!zfs_refcount_is_zero(&db->db_holds));

	/*
	 * A dbuf that is already dirty in this txg is going to be written
	 * regardless, so leave its existing dirty record untouched.
	 */
	mutex_enter(&db->db_mtx);
	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
	mutex_exit(&db->db_mtx);
	if (dr != NULL)
		return;

	/*
	 * Not dirty yet: dirty it now.  db_mtx is not held across this call.
	 */
	dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);

	/*
	 * Look the dirty record up again and tag it as a rewrite; only
	 * level-0 dbufs carry the flag.
	 */
	mutex_enter(&db->db_mtx);
	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
	if (dr != NULL) {
		if (db->db_level == 0)
			dr->dt.dl.dr_rewrite = B_TRUE;
	}
	mutex_exit(&db->db_mtx);
}
boolean_t
dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
@@ -5338,6 +5376,24 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
/*
* Set rewrite properties for zfs_rewrite() operations.
*/
if (db->db_level == 0 && dr->dt.dl.dr_rewrite) {
zp.zp_rewrite = B_TRUE;
/*
* Mark physical rewrite feature for activation.
* This will be activated automatically during dataset sync.
*/
dsl_dataset_t *ds = os->os_dsl_dataset;
if (!dsl_dataset_feature_is_active(ds,
SPA_FEATURE_PHYSICAL_REWRITE)) {
ds->ds_feature_activation[
SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE;
}
}
/*
* We copy the blkptr now (rather than when we instantiate the dirty
* record), because its value can change between open context and
@@ -5408,6 +5464,7 @@ EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_will_rewrite);
EXPORT_SYMBOL(dmu_buf_is_dirty);
EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
+1
View File
@@ -2508,6 +2508,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_encrypt = encrypt;
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
zp->zp_rewrite = B_FALSE;
memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
+14 -2
View File
@@ -49,6 +49,7 @@
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_crypt.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
@@ -1101,13 +1102,21 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
{
int error;
if (flags != 0 || arg != 0)
if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0)
return (SET_ERROR(EINVAL));
zfsvfs_t *zfsvfs = ZTOZSB(zp);
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
/* Check if physical rewrite is allowed */
spa_t *spa = zfsvfs->z_os->os_spa;
if ((flags & ZFS_REWRITE_PHYSICAL) &&
!spa_feature_is_enabled(spa, SPA_FEATURE_PHYSICAL_REWRITE)) {
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (zfs_is_readonly(zfsvfs)) {
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EROFS));
@@ -1195,7 +1204,10 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
if (dmu_buf_is_dirty(dbp[i], tx))
continue;
nw += dbp[i]->db_size;
dmu_buf_will_dirty(dbp[i], tx);
if (flags & ZFS_REWRITE_PHYSICAL)
dmu_buf_will_rewrite(dbp[i], tx);
else
dmu_buf_will_dirty(dbp[i], tx);
}
dmu_buf_rele_array(dbp, numbufs, FTAG);
+26
View File
@@ -3923,6 +3923,23 @@ zio_ddt_write(zio_t *zio)
* then we can just use them as-is.
*/
if (have_dvas >= need_dvas) {
/*
* For rewrite operations, try preserving the original
* logical birth time. If the result matches the
* original BP, this becomes a NOP.
*/
if (zp->zp_rewrite) {
uint64_t orig_logical_birth =
BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig);
ddt_bp_fill(ddp, v, bp, orig_logical_birth);
if (BP_EQUAL(bp, &zio->io_bp_orig)) {
/* We can skip accounting. */
zio->io_flags |= ZIO_FLAG_NOPWRITE;
ddt_exit(ddt);
return (zio);
}
}
ddt_bp_fill(ddp, v, bp, txg);
ddt_phys_addref(ddp, v);
ddt_exit(ddt);
@@ -4355,6 +4372,15 @@ again:
error);
}
zio->io_error = error;
} else if (zio->io_prop.zp_rewrite) {
/*
* For rewrite operations, preserve the logical birth time
* but set the physical birth time to the current txg.
*/
uint64_t logical_birth = BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig);
ASSERT3U(logical_birth, <=, zio->io_txg);
BP_SET_BIRTH(zio->io_bp, logical_birth, zio->io_txg);
BP_SET_REWRITE(zio->io_bp, 1);
}
return (zio);