From 2646bd558533452c015d124b4d149b1dfbb40ce0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 9 Feb 2026 13:17:56 -0500 Subject: [PATCH] Allow rewrite skip cloned and snapshotted blocks Rewrite of cloned and snapshotted blocks can allocate additional space, that may be undesired. In some cases it may have sense to still rewrite snapshotted blocks, expecting the snapshots to rotate with time, freeing space. In other cases rewrite of cloned blocks may be acceptable, despite persistent space usage increase. For this reason add them as separate flags to `zfs rewrite`. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Reviewed-by: Ameer Hamza Signed-off-by: Alexander Motin Closes #18179 --- cmd/zfs/zfs_main.c | 12 ++- include/sys/dmu_objset.h | 1 + include/sys/fs/zfs.h | 4 +- man/man8/zfs-rewrite.8 | 19 ++++- module/zfs/dmu_objset.c | 19 +++++ module/zfs/zfs_vnops.c | 39 ++++++++- tests/runfiles/common.run | 3 +- tests/runfiles/sanity.run | 3 +- tests/zfs-tests/include/libtest.shlib | 22 +++++ tests/zfs-tests/tests/Makefile.am | 2 + .../block_cloning/block_cloning.kshlib | 24 ------ .../zfs_rewrite/zfs_rewrite_skip_clone.ksh | 83 +++++++++++++++++++ .../zfs_rewrite/zfs_rewrite_skip_snapshot.ksh | 74 +++++++++++++++++ 13 files changed, 273 insertions(+), 32 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_skip_clone.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_skip_snapshot.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 4152bbf8d..1917a0f39 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -439,8 +439,8 @@ get_usage(zfs_help_t idx) return (gettext("\tredact " " ...\n")); case HELP_REWRITE: - return (gettext("\trewrite [-Prvx] [-o ] [-l ] " - "\n")); + return (gettext("\trewrite [-CPSrvx] [-o ] " + "[-l ] \n")); case HELP_JAIL: return (gettext("\tjail \n")); case HELP_UNJAIL: @@ -9080,11 +9080,17 @@ zfs_do_rewrite(int argc, char **argv) zfs_rewrite_args_t args; memset(&args, 0, sizeof (args)); - while ((c = getopt(argc, argv, "Pl:o:rvx")) != -1) { + while ((c = getopt(argc, argv, "CPSl:o:rvx")) != -1) { switch (c) { + case 'C': + args.flags |= ZFS_REWRITE_SKIP_BRT; + break; case 'P': args.flags |= ZFS_REWRITE_PHYSICAL; break; + case 'S': + args.flags |= ZFS_REWRITE_SKIP_SNAPSHOT; + break; case 'l': args.len = strtoll(optarg, NULL, 0); break; diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 492be2920..a7653582f 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -236,6 +236,7 @@ int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, void *arg, int flags); void dmu_objset_evict_dbufs(objset_t *os); inode_timespec_t dmu_objset_snap_cmtime(objset_t *os); +boolean_t dmu_objset_block_is_shared(objset_t *os, const blkptr_t *bp); /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 507d1fa2d..ca929ed51 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1638,7 +1638,9 @@ typedef struct zfs_rewrite_args { } zfs_rewrite_args_t; /* zfs_rewrite_args flags */ -#define ZFS_REWRITE_PHYSICAL 0x1 /* Preserve logical birth time. */ +#define ZFS_REWRITE_PHYSICAL 0x1 /* Preserve logical birth time. */ +#define ZFS_REWRITE_SKIP_SNAPSHOT 0x2 /* Skip snapshot-shared blocks. */ +#define ZFS_REWRITE_SKIP_BRT 0x4 /* Skip BRT-cloned blocks. */ #define ZFS_IOC_REWRITE _IOW(0x83, 3, zfs_rewrite_args_t) diff --git a/man/man8/zfs-rewrite.8 b/man/man8/zfs-rewrite.8 index ae0a15882..daec277d8 100644 --- a/man/man8/zfs-rewrite.8 +++ b/man/man8/zfs-rewrite.8 @@ -32,7 +32,7 @@ .Sh SYNOPSIS .Nm zfs .Cm rewrite -.Oo Fl Prvx Ns Oc +.Oo Fl CPSrvx Ns Oc .Op Fl l Ar length .Op Fl o Ar offset .Ar file Ns | Ns Ar directory Ns … @@ -45,6 +45,11 @@ as if they were atomically read and written back. .No See Sx NOTES . for more information about property changes that may be applied during rewrite. .Bl -tag -width "-r" +.It Fl C +Skip blocks that are shared via block cloning (BRT). +Cloned blocks are referenced by multiple files or datasets. +Rewriting these blocks would create separate copies and increase space usage. +This flag prevents such expansion by skipping cloned blocks. .It Fl P Perform physical rewrite, preserving logical birth time of blocks. By default, rewrite updates logical birth times, making blocks appear @@ -54,6 +59,12 @@ inclusion in incremental streams. Physical rewrite requires the .Sy physical_rewrite feature to be enabled on the pool. +.It Fl S +Skip blocks that are shared with snapshots. +Blocks created before the most recent snapshot are shared with that snapshot. +Rewriting these blocks would create new copies, leaving the old copies for +the snapshot and increasing space usage. +This flag prevents such expansion by skipping snapshot-shared blocks. .It Fl l Ar length Rewrite at most this number of bytes. .It Fl o Ar offset @@ -82,6 +93,12 @@ will have no effect. .Pp Rewrite of cloned blocks and blocks that are part of any snapshots, same as some property changes may increase pool space usage. +Use the +.Fl C +and +.Fl S +flags to skip cloned and snapshot-shared blocks respectively to prevent +this expansion. Holes that were never written or were previously zero-compressed are not rewritten and will remain holes even if compression is disabled. .Pp diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 5a815b59e..ed81647bc 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -3046,6 +3046,24 @@ dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } +/* + * Check if a block is shared with a snapshot in this objset. + * Returns B_TRUE if block was created before or at the time of the + * previous snapshot, B_FALSE otherwise. + */ +boolean_t +dmu_objset_block_is_shared(objset_t *os, const blkptr_t *bp) +{ + if (BP_IS_HOLE(bp)) + return (B_FALSE); + + dsl_dataset_t *ds = os->os_dsl_dataset; + if (ds == NULL) + return (B_FALSE); + + return (BP_GET_BIRTH(bp) <= dsl_dataset_phys(ds)->ds_prev_snap_txg); +} + #if defined(_KERNEL) EXPORT_SYMBOL(dmu_objset_zil); EXPORT_SYMBOL(dmu_objset_pool); @@ -3090,4 +3108,5 @@ EXPORT_SYMBOL(dmu_objset_projectquota_enabled); EXPORT_SYMBOL(dmu_objset_projectquota_present); EXPORT_SYMBOL(dmu_objset_projectquota_upgradable); EXPORT_SYMBOL(dmu_objset_id_quota_upgrade); +EXPORT_SYMBOL(dmu_objset_block_is_shared); #endif diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 7bb9ba57c..8f29474ac 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -1095,6 +1096,34 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (0); } +/* + * Check if a block should be skipped during rewrite. + * Returns B_TRUE if block should be skipped. + */ +static boolean_t +zfs_rewrite_skip(dmu_buf_t *db, objset_t *os, uint64_t flags) +{ + /* + * This may be slightly stale and racy, but should be OK for + * the advisory use. + */ + blkptr_t *bp = dmu_buf_get_blkptr(db); + if (bp == NULL) + return (B_TRUE); + + if (flags & ZFS_REWRITE_SKIP_SNAPSHOT) { + if (dmu_objset_block_is_shared(os, bp)) + return (B_TRUE); + } + + if (flags & ZFS_REWRITE_SKIP_BRT) { + if (brt_maybe_exists(os->os_spa, bp)) + return (B_TRUE); + } + + return (B_FALSE); +} + /* * Rewrite a range of file as-is without modification. * @@ -1113,7 +1142,11 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, { int error; - if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0) +#define ZFS_REWRITE_VALID_FLAGS \ + (ZFS_REWRITE_PHYSICAL | ZFS_REWRITE_SKIP_SNAPSHOT | \ + ZFS_REWRITE_SKIP_BRT) + + if ((flags & ~ZFS_REWRITE_VALID_FLAGS) != 0 || arg != 0) return (SET_ERROR(EINVAL)); zfsvfs_t *zfsvfs = ZTOZSB(zp); @@ -1214,6 +1247,10 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, nr += dbp[i]->db_size; if (dmu_buf_is_dirty(dbp[i], tx)) continue; + + if (zfs_rewrite_skip(dbp[i], zfsvfs->z_os, flags)) + continue; + nw += dbp[i]->db_size; if (flags & ZFS_REWRITE_PHYSICAL) dmu_buf_will_rewrite(dbp[i], tx); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index b83f50a4b..19df29ec3 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -309,7 +309,8 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rewrite] -tests = ['zfs_rewrite', 'zfs_rewrite_physical'] +tests = ['zfs_rewrite', 'zfs_rewrite_physical', 'zfs_rewrite_skip_clone', + 'zfs_rewrite_skip_snapshot'] tags = ['functional', 'cli_root', 'zfs_rewrite'] [tests/functional/cli_root/zfs_rollback] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index 95bdf05dc..dad51d2e9 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -195,7 +195,8 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rewrite] -tests = ['zfs_rewrite', 'zfs_rewrite_physical'] +tests = ['zfs_rewrite', 'zfs_rewrite_physical', 'zfs_rewrite_skip_clone', + 'zfs_rewrite_skip_snapshot'] tags = ['functional', 'cli_root', 'zfs_rewrite'] [tests/functional/cli_root/zfs_rollback] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 3d697726a..096d6b18e 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3943,4 +3943,26 @@ function pop_coredump_pattern esac } +# +# get_same_blocks dataset1 path/to/file1 dataset2 path/to/file2 [key] +# +# Returns a space-separated list of the indexes (starting at 0) of the L0 +# blocks that are shared between both files (by first DVA and checksum). +# +function get_same_blocks # dataset1 file1 dataset2 file2 [key] +{ + typeset KEY=$5 + if [ ${#KEY} -gt 0 ]; then + KEY="--key=$KEY" + fi + typeset zdbout1=$(mktemp) + typeset zdbout2=$(mktemp) + zdb $KEY -vvvvv $1 -O $2 | \ + awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout1 + zdb $KEY -vvvvv $3 -O $4 | \ + awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout2 + echo $(sort -n $zdbout1 $zdbout2 | uniq -d | cut -f1 -d' ') + rm -f $zdbout1 $zdbout2 +} + . ${STF_SUITE}/include/kstat.shlib diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 19b31d2ea..9bb39b012 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -876,6 +876,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_rewrite/setup.ksh \ functional/cli_root/zfs_rewrite/zfs_rewrite.ksh \ functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh \ + functional/cli_root/zfs_rewrite/zfs_rewrite_skip_clone.ksh \ + functional/cli_root/zfs_rewrite/zfs_rewrite_skip_snapshot.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib index b2a2b09f5..67f4b109b 100644 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib @@ -35,27 +35,3 @@ function have_same_content log_must [ "$hash1" = "$hash2" ] } -# -# get_same_blocks dataset1 path/to/file1 dataset2 path/to/file2 -# -# Returns a space-separated list of the indexes (starting at 0) of the L0 -# blocks that are shared between both files (by first DVA and checksum). -# Assumes that the two files have the same content, use have_same_content to -# confirm that. -# -function get_same_blocks -{ - KEY=$5 - if [ ${#KEY} -gt 0 ]; then - KEY="--key=$KEY" - fi - typeset zdbout1=$(mktemp) - typeset zdbout2=$(mktemp) - zdb $KEY -vvvvv $1 -O $2 | \ - awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout1 - zdb $KEY -vvvvv $3 -O $4 | \ - awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout2 - echo $(sort -n $zdbout1 $zdbout2 | uniq -d | cut -f1 -d' ') - rm -f $zdbout1 $zdbout2 -} - diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_skip_clone.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_skip_clone.ksh new file mode 100755 index 000000000..464f41535 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_skip_clone.ksh @@ -0,0 +1,83 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, iXsystems, Inc. +# + +# DESCRIPTION: +# Verify zfs rewrite -C flag skips BRT-cloned blocks. +# +# STRATEGY: +# 1. Create a test file and sync it. +# 2. Clone the file using block cloning to share blocks via BRT. +# 3. Rewrite clone with -C flag and verify blocks are NOT rewritten. +# 4. Rewrite clone without -C flag and verify blocks ARE rewritten. + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +function cleanup +{ + rm -rf $TESTDIR/* +} + +log_assert "zfs rewrite -C flag skips BRT-cloned blocks" + +log_onexit cleanup + +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +# Create source file (4 x 128KB = 4 blocks) +log_must dd if=/dev/urandom of=$TESTDIR/source bs=128k count=4 +log_must sync_pool $TESTPOOL + +# Clone the file using block cloning +log_must clonefile -f $TESTDIR/source $TESTDIR/clone +log_must sync_pool $TESTPOOL + +# Verify blocks are actually shared initially +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS source \ + $TESTPOOL/$TESTFS clone) +log_must [ "$blocks" = "0 1 2 3" ] + +# Test 1: Rewrite clone WITH -C flag (should skip all cloned blocks) +log_must zfs rewrite -C $TESTDIR/clone +log_must sync_pool $TESTPOOL + +# Blocks should still be shared (all blocks were skipped) +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS source \ + $TESTPOOL/$TESTFS clone) +log_must [ "$blocks" = "0 1 2 3" ] + +# Test 2: Rewrite clone WITHOUT -C flag (should rewrite all blocks) +log_must zfs rewrite $TESTDIR/clone +log_must sync_pool $TESTPOOL + +# No blocks should be shared (clone has new blocks) +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS source \ + $TESTPOOL/$TESTFS clone) +log_must [ -z "$blocks" ] + +log_pass diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_skip_snapshot.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_skip_snapshot.ksh new file mode 100755 index 000000000..9fc039ba2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_skip_snapshot.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2026, iXsystems, Inc. +# + +# DESCRIPTION: +# Verify zfs rewrite -S flag skips snapshot-shared blocks. +# +# STRATEGY: +# 1. Create a test file and sync it. +# 2. Take a snapshot to share the blocks. +# 3. Rewrite with -S flag and verify blocks are NOT rewritten. +# 4. Rewrite without -S flag and verify blocks ARE rewritten. + +. $STF_SUITE/include/libtest.shlib + +function cleanup +{ + rm -rf $TESTDIR/* + zfs destroy -R $TESTPOOL/$TESTFS@snap1 2>/dev/null || true +} + +log_assert "zfs rewrite -S flag skips snapshot-shared blocks" + +log_onexit cleanup + +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +# Create test file (4 x 128KB = 4 blocks) and snapshot +log_must dd if=/dev/urandom of=$TESTDIR/testfile bs=128k count=4 +log_must sync_pool $TESTPOOL +log_must zfs snapshot $TESTPOOL/$TESTFS@snap1 + +# Test 1: Rewrite WITH -S flag (should skip all snapshot-shared blocks) +log_must zfs rewrite -S $TESTDIR/testfile +log_must sync_pool $TESTPOOL + +# All blocks should still be shared (all blocks were skipped) +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS testfile \ + $TESTPOOL/$TESTFS@snap1 testfile) +log_must [ "$blocks" = "0 1 2 3" ] + +# Test 2: Rewrite WITHOUT -S flag (should rewrite all blocks) +log_must zfs rewrite $TESTDIR/testfile +log_must sync_pool $TESTPOOL + +# No blocks should be shared (all blocks were rewritten) +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS testfile \ + $TESTPOOL/$TESTFS@snap1 testfile) +log_must [ -z "$blocks" ] + +log_pass