diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 81727224b..533b355fa 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -440,7 +440,7 @@ get_usage(zfs_help_t idx) return (gettext("\tredact " " ...\n")); case HELP_REWRITE: - return (gettext("\trewrite [-rvx] [-o ] [-l ] " + return (gettext("\trewrite [-Prvx] [-o ] [-l ] " "\n")); case HELP_JAIL: return (gettext("\tjail \n")); @@ -9177,8 +9177,11 @@ zfs_do_rewrite(int argc, char **argv) zfs_rewrite_args_t args; memset(&args, 0, sizeof (args)); - while ((c = getopt(argc, argv, "l:o:rvx")) != -1) { + while ((c = getopt(argc, argv, "Pl:o:rvx")) != -1) { switch (c) { + case 'P': + args.flags |= ZFS_REWRITE_PHYSICAL; + break; case 'l': args.len = strtoll(optarg, NULL, 0); break; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 756459b2f..baf3b1508 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -164,6 +164,7 @@ typedef struct dbuf_dirty_record { boolean_t dr_nopwrite; boolean_t dr_brtwrite; boolean_t dr_diowrite; + boolean_t dr_rewrite; boolean_t dr_has_raw_params; /* Override and raw params are mutually exclusive. */ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 7dc6daaf0..7c2024a16 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -825,6 +825,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags); +void dmu_buf_will_rewrite(dmu_buf_t *db, dmu_tx_t *tx); boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c8deb5be4..fc359c103 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1627,6 +1627,9 @@ typedef struct zfs_rewrite_args { uint64_t arg; } zfs_rewrite_args_t; +/* zfs_rewrite_args flags */ +#define ZFS_REWRITE_PHYSICAL 0x1 /* Preserve logical birth time. */ + #define ZFS_IOC_REWRITE _IOW(0x83, 3, zfs_rewrite_args_t) /* diff --git a/include/sys/zio.h b/include/sys/zio.h index b139c9de4..a33680346 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -374,6 +374,7 @@ typedef struct zio_prop { boolean_t zp_encrypt; boolean_t zp_byteorder; boolean_t zp_direct_write; + boolean_t zp_rewrite; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 4877df4b1..56382ca85 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -89,6 +89,7 @@ typedef enum spa_feature { SPA_FEATURE_LARGE_MICROZAP, SPA_FEATURE_DYNAMIC_GANG_HEADER, SPA_FEATURE_BLOCK_CLONING_ENDIAN, + SPA_FEATURE_PHYSICAL_REWRITE, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 37d22402e..ba161d1ef 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -639,7 +639,7 @@ - + @@ -6399,7 +6399,8 @@ - + + @@ -9614,8 +9615,8 @@ - - + + @@ -9693,7 +9694,7 @@ - + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 66aa100b7..10dfd1f92 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -853,6 +853,23 @@ when the command is used on a top-level vdev, and will never return to being .Sy enabled . . +.feature com.truenas physical_rewrite yes extensible_dataset +This feature enables physical block rewriting that preserves logical birth +times, avoiding unnecessary inclusion of rewritten blocks in incremental +.Nm zfs Cm send +streams. +When enabled, the +.Nm zfs Cm rewrite Fl P +command can be used. +.Pp +This feature becomes +.Sy active +the first time +.Nm zfs Cm rewrite Fl P +is used on any dataset, and will return to being +.Sy enabled +once all datasets that have ever used physical rewrite are destroyed. +. .feature org.zfsonlinux project_quota yes extensible_dataset This feature allows administrators to account the spaces and objects usage information against the project identifier diff --git a/man/man8/zfs-rewrite.8 b/man/man8/zfs-rewrite.8 index 423d6d439..a3a037f37 100644 --- a/man/man8/zfs-rewrite.8 +++ b/man/man8/zfs-rewrite.8 @@ -31,7 +31,7 @@ .Sh SYNOPSIS .Nm zfs .Cm rewrite -.Oo Fl rvx Ns Oc +.Oo Fl Prvx Ns Oc .Op Fl l Ar length .Op Fl o Ar offset .Ar file Ns | Ns Ar directory Ns … @@ -43,6 +43,15 @@ as is without modification at a new location and possibly with new properties, such as checksum, compression, dedup, copies, etc, as if they were atomically read and written back. .Bl -tag -width "-r" +.It Fl P +Perform physical rewrite, preserving logical birth time of blocks. +By default, rewrite updates logical birth times, making blocks appear +as modified in snapshots and incremental send streams. +Physical rewrite preserves logical birth times, avoiding unnecessary +inclusion in incremental streams. +Physical rewrite requires the +.Sy physical_rewrite +feature to be enabled on the pool. .It Fl l Ar length Rewrite at most this number of bytes. .It Fl o Ar offset @@ -60,17 +69,22 @@ same as some property changes may increase pool space usage. Holes that were never written or were previously zero-compressed are not rewritten and will remain holes even if compression is disabled. .Pp -Rewritten blocks will be seen as modified in next snapshot and as such -included into the incremental -.Nm zfs Cm send -stream. -.Pp If a .Fl l or .Fl o value request a rewrite to regions past the end of the file, then those regions are silently ignored, and no error is reported. +.Pp +By default, rewritten blocks update their logical birth time, +meaning they will be included in incremental +.Nm zfs Cm send +streams as modified data. +When the +.Fl P +flag is used, rewritten blocks preserve their logical birth time, since +there are no user data changes. . .Sh SEE ALSO -.Xr zfsprops 7 +.Xr zfsprops 7 , +.Xr zpool-features 7 diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 0b37530b0..6ba9892ee 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -798,6 +798,18 @@ zpool_feature_init(void) ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t physical_rewrite_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_PHYSICAL_REWRITE, + "com.truenas:physical_rewrite", "physical_rewrite", + "Support for preserving logical birth time during rewrite.", + ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, + ZFEATURE_TYPE_BOOLEAN, physical_rewrite_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index a96666a46..432c99cec 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2160,6 +2160,12 @@ dbuf_redirty(dbuf_dirty_record_t *dr) ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } + + /* + * Clear the rewrite flag since this is now a logical + * modification. + */ + dr->dt.dl.dr_rewrite = B_FALSE; } } @@ -2707,6 +2713,38 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH); } +void +dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + ASSERT(tx->tx_txg != 0); + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + + /* + * If the dbuf is already dirty in this txg, it will be written + * anyway, so there's nothing to do. + */ + mutex_enter(&db->db_mtx); + if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + mutex_exit(&db->db_mtx); + return; + } + mutex_exit(&db->db_mtx); + + /* + * The dbuf is not dirty, so we need to make it dirty and + * mark it for rewrite (preserve logical birth time). + */ + dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH); + + mutex_enter(&db->db_mtx); + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); + if (dr != NULL && db->db_level == 0) + dr->dt.dl.dr_rewrite = B_TRUE; + mutex_exit(&db->db_mtx); +} + boolean_t dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { @@ -5338,6 +5376,24 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + /* + * Set rewrite properties for zfs_rewrite() operations. + */ + if (db->db_level == 0 && dr->dt.dl.dr_rewrite) { + zp.zp_rewrite = B_TRUE; + + /* + * Mark physical rewrite feature for activation. + * This will be activated automatically during dataset sync. + */ + dsl_dataset_t *ds = os->os_dsl_dataset; + if (!dsl_dataset_feature_is_active(ds, + SPA_FEATURE_PHYSICAL_REWRITE)) { + ds->ds_feature_activation[ + SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE; + } + } + /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and @@ -5408,6 +5464,7 @@ EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); +EXPORT_SYMBOL(dmu_buf_will_rewrite); EXPORT_SYMBOL(dmu_buf_is_dirty); EXPORT_SYMBOL(dmu_buf_will_clone_or_dio); EXPORT_SYMBOL(dmu_buf_will_not_fill); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 690227a30..296e58ef9 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2508,6 +2508,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE; + zp->zp_rewrite = B_FALSE; memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 8ad992f5b..74aa91a4f 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -1101,13 +1102,21 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, { int error; - if (flags != 0 || arg != 0) + if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0) return (SET_ERROR(EINVAL)); zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); + /* Check if physical rewrite is allowed */ + spa_t *spa = zfsvfs->z_os->os_spa; + if ((flags & ZFS_REWRITE_PHYSICAL) && + !spa_feature_is_enabled(spa, SPA_FEATURE_PHYSICAL_REWRITE)) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); @@ -1195,7 +1204,10 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, if (dmu_buf_is_dirty(dbp[i], tx)) continue; nw += dbp[i]->db_size; - dmu_buf_will_dirty(dbp[i], tx); + if (flags & ZFS_REWRITE_PHYSICAL) + dmu_buf_will_rewrite(dbp[i], tx); + else + dmu_buf_will_dirty(dbp[i], tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 41e0dc000..218aec609 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3923,6 +3923,23 @@ zio_ddt_write(zio_t *zio) * then we can just use them as-is. */ if (have_dvas >= need_dvas) { + /* + * For rewrite operations, try preserving the original + * logical birth time. If the result matches the + * original BP, this becomes a NOP. + */ + if (zp->zp_rewrite) { + uint64_t orig_logical_birth = + BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig); + ddt_bp_fill(ddp, v, bp, orig_logical_birth); + if (BP_EQUAL(bp, &zio->io_bp_orig)) { + /* We can skip accounting. */ + zio->io_flags |= ZIO_FLAG_NOPWRITE; + ddt_exit(ddt); + return (zio); + } + } + ddt_bp_fill(ddp, v, bp, txg); ddt_phys_addref(ddp, v); ddt_exit(ddt); @@ -4355,6 +4372,15 @@ again: error); } zio->io_error = error; + } else if (zio->io_prop.zp_rewrite) { + /* + * For rewrite operations, preserve the logical birth time + * but set the physical birth time to the current txg. + */ + uint64_t logical_birth = BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig); + ASSERT3U(logical_birth, <=, zio->io_txg); + BP_SET_BIRTH(zio->io_bp, logical_birth, zio->io_txg); + BP_SET_REWRITE(zio->io_bp, 1); } return (zio); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index deca3c05b..9fad8946f 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -308,7 +308,7 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rewrite] -tests = ['zfs_rewrite'] +tests = ['zfs_rewrite', 'zfs_rewrite_physical'] tags = ['functional', 'cli_root', 'zfs_rewrite'] [tests/functional/cli_root/zfs_rollback] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index 732f252b5..7767c0c2d 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -195,7 +195,7 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rewrite] -tests = ['zfs_rewrite'] +tests = ['zfs_rewrite', 'zfs_rewrite_physical'] tags = ['functional', 'cli_root', 'zfs_rewrite'] [tests/functional/cli_root/zfs_rollback] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 5ab28b2d6..c2542287c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -869,6 +869,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_rewrite/cleanup.ksh \ functional/cli_root/zfs_rewrite/setup.ksh \ functional/cli_root/zfs_rewrite/zfs_rewrite.ksh \ + functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh new file mode 100755 index 000000000..142e44f53 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh @@ -0,0 +1,100 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, iXsystems, Inc. +# + +# DESCRIPTION: +# Verify zfs rewrite -P flag correctly preserves logical birth times. +# +# STRATEGY: +# 1. Create a test file and sync it. +# 2. Create a snapshot to capture the original birth time. +# 3. Test default rewrite behavior (updates logical birth time). +# 4. Test -P flag behavior (preserves logical birth time). +# 5. Verify incremental send behavior difference. + +. $STF_SUITE/include/libtest.shlib + +typeset tmp=$(mktemp) +typeset send_default=$(mktemp) +typeset send_physical=$(mktemp) + +function cleanup +{ + rm -rf $tmp $send_default $send_physical $TESTDIR/* + zfs destroy -R $TESTPOOL/$TESTFS@snap1 2>/dev/null || true + zfs destroy -R $TESTPOOL/$TESTFS@snap2 2>/dev/null || true + zfs destroy -R $TESTPOOL/$TESTFS@snap3 2>/dev/null || true +} + +log_assert "zfs rewrite -P flag correctly preserves logical birth times" + +log_onexit cleanup + +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +# Create test file and initial snapshot +log_must dd if=/dev/urandom of=$TESTDIR/testfile bs=128k count=4 +log_must sync_pool $TESTPOOL +typeset orig_hash=$(xxh128digest $TESTDIR/testfile) +log_must zfs snapshot $TESTPOOL/$TESTFS@snap1 + +# Test default rewrite behavior (updates logical birth time) +log_must zfs rewrite $TESTDIR/testfile +log_must sync_pool $TESTPOOL +typeset default_hash=$(xxh128digest $TESTDIR/testfile) +log_must [ "$orig_hash" = "$default_hash" ] +log_must zfs snapshot $TESTPOOL/$TESTFS@snap2 + +# Test incremental send size - should be large with updated birth time +log_must eval "zfs send -i @snap1 $TESTPOOL/$TESTFS@snap2 > $send_default" +typeset default_size=$(wc -c < $send_default) +log_note "Default rewrite incremental send size: $default_size bytes" + +# Reset the file to original state +log_must zfs rollback -r $TESTPOOL/$TESTFS@snap1 + +# Test -P flag behavior (preserves logical birth time) +log_must zfs rewrite -P $TESTDIR/testfile +log_must sync_pool $TESTPOOL +typeset physical_hash=$(xxh128digest $TESTDIR/testfile) +log_must [ "$orig_hash" = "$physical_hash" ] +log_must zfs snapshot $TESTPOOL/$TESTFS@snap3 + +# Test incremental send size - should be minimal with preserved birth time +log_must eval "zfs send -i @snap1 $TESTPOOL/$TESTFS@snap3 > $send_physical" +typeset physical_size=$(wc -c < $send_physical) +log_note "Physical rewrite incremental send size: $physical_size bytes" + +# Verify that -P flag produces smaller incremental send +if [[ $physical_size -lt $default_size ]]; then + log_note "SUCCESS: -P flag produces smaller incremental send" \ + "($physical_size < $default_size)" +else + log_fail "FAIL: -P flag should produce smaller incremental send" \ + "($physical_size >= $default_size)" +fi + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 3389dcf72..bdf5fdf85 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -92,6 +92,7 @@ typeset -a properties=( "feature@draid" "feature@redaction_list_spill" "feature@dynamic_gang_header" + "feature@physical_rewrite" ) if is_linux || is_freebsd; then