Allow rewrite to skip cloned and snapshotted blocks

Rewriting cloned and snapshotted blocks can allocate additional
space, which may be undesired.  In some cases it may make sense
to still rewrite snapshotted blocks, expecting the snapshots to
rotate with time, freeing space.  In other cases rewriting cloned
blocks may be acceptable, despite the persistent increase in space
usage.  For this reason, add separate flags to `zfs rewrite` for
skipping cloned blocks and for skipping snapshotted blocks.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18179
This commit is contained in:
Alexander Motin 2026-02-09 13:17:56 -05:00 committed by GitHub
parent 15fbf534c6
commit 2646bd5585
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 273 additions and 32 deletions

View File

@ -439,8 +439,8 @@ get_usage(zfs_help_t idx)
return (gettext("\tredact <snapshot> <bookmark> " return (gettext("\tredact <snapshot> <bookmark> "
"<redaction_snapshot> ...\n")); "<redaction_snapshot> ...\n"));
case HELP_REWRITE: case HELP_REWRITE:
return (gettext("\trewrite [-Prvx] [-o <offset>] [-l <length>] " return (gettext("\trewrite [-CPSrvx] [-o <offset>] "
"<directory|file ...>\n")); "[-l <length>] <directory|file ...>\n"));
case HELP_JAIL: case HELP_JAIL:
return (gettext("\tjail <jailid|jailname> <filesystem>\n")); return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
case HELP_UNJAIL: case HELP_UNJAIL:
@ -9080,11 +9080,17 @@ zfs_do_rewrite(int argc, char **argv)
zfs_rewrite_args_t args; zfs_rewrite_args_t args;
memset(&args, 0, sizeof (args)); memset(&args, 0, sizeof (args));
while ((c = getopt(argc, argv, "Pl:o:rvx")) != -1) { while ((c = getopt(argc, argv, "CPSl:o:rvx")) != -1) {
switch (c) { switch (c) {
case 'C':
args.flags |= ZFS_REWRITE_SKIP_BRT;
break;
case 'P': case 'P':
args.flags |= ZFS_REWRITE_PHYSICAL; args.flags |= ZFS_REWRITE_PHYSICAL;
break; break;
case 'S':
args.flags |= ZFS_REWRITE_SKIP_SNAPSHOT;
break;
case 'l': case 'l':
args.len = strtoll(optarg, NULL, 0); args.len = strtoll(optarg, NULL, 0);
break; break;

View File

@ -236,6 +236,7 @@ int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
void *arg, int flags); void *arg, int flags);
void dmu_objset_evict_dbufs(objset_t *os); void dmu_objset_evict_dbufs(objset_t *os);
inode_timespec_t dmu_objset_snap_cmtime(objset_t *os); inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);
boolean_t dmu_objset_block_is_shared(objset_t *os, const blkptr_t *bp);
/* called from dsl */ /* called from dsl */
void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);

View File

@ -1638,7 +1638,9 @@ typedef struct zfs_rewrite_args {
} zfs_rewrite_args_t; } zfs_rewrite_args_t;
/* zfs_rewrite_args flags */ /* zfs_rewrite_args flags */
#define ZFS_REWRITE_PHYSICAL 0x1 /* Preserve logical birth time. */ #define ZFS_REWRITE_PHYSICAL 0x1 /* Preserve logical birth time. */
#define ZFS_REWRITE_SKIP_SNAPSHOT 0x2 /* Skip snapshot-shared blocks. */
#define ZFS_REWRITE_SKIP_BRT 0x4 /* Skip BRT-cloned blocks. */
#define ZFS_IOC_REWRITE _IOW(0x83, 3, zfs_rewrite_args_t) #define ZFS_IOC_REWRITE _IOW(0x83, 3, zfs_rewrite_args_t)

View File

@ -32,7 +32,7 @@
.Sh SYNOPSIS .Sh SYNOPSIS
.Nm zfs .Nm zfs
.Cm rewrite .Cm rewrite
.Oo Fl Prvx Ns Oc .Oo Fl CPSrvx Ns Oc
.Op Fl l Ar length .Op Fl l Ar length
.Op Fl o Ar offset .Op Fl o Ar offset
.Ar file Ns | Ns Ar directory Ns .Ar file Ns | Ns Ar directory Ns
@ -45,6 +45,11 @@ as if they were atomically read and written back.
.No See Sx NOTES . .No See Sx NOTES .
for more information about property changes that may be applied during rewrite. for more information about property changes that may be applied during rewrite.
.Bl -tag -width "-r" .Bl -tag -width "-r"
.It Fl C
Skip blocks that are shared via block cloning (BRT).
Cloned blocks are referenced by multiple files or datasets.
Rewriting these blocks would create separate copies and increase space usage.
This flag prevents such expansion by skipping cloned blocks.
.It Fl P .It Fl P
Perform physical rewrite, preserving logical birth time of blocks. Perform physical rewrite, preserving logical birth time of blocks.
By default, rewrite updates logical birth times, making blocks appear By default, rewrite updates logical birth times, making blocks appear
@ -54,6 +59,12 @@ inclusion in incremental streams.
Physical rewrite requires the Physical rewrite requires the
.Sy physical_rewrite .Sy physical_rewrite
feature to be enabled on the pool. feature to be enabled on the pool.
.It Fl S
Skip blocks that are shared with snapshots.
Blocks created before the most recent snapshot are shared with that snapshot.
Rewriting these blocks would create new copies, leaving the old copies for
the snapshot and increasing space usage.
This flag prevents such expansion by skipping snapshot-shared blocks.
.It Fl l Ar length .It Fl l Ar length
Rewrite at most this number of bytes. Rewrite at most this number of bytes.
.It Fl o Ar offset .It Fl o Ar offset
@ -82,6 +93,12 @@ will have no effect.
.Pp .Pp
Rewrite of cloned blocks and blocks that are part of any snapshots, Rewrite of cloned blocks and blocks that are part of any snapshots,
same as some property changes may increase pool space usage. same as some property changes may increase pool space usage.
Use the
.Fl C
and
.Fl S
flags to skip cloned and snapshot-shared blocks respectively to prevent
this expansion.
Holes that were never written or were previously zero-compressed are Holes that were never written or were previously zero-compressed are
not rewritten and will remain holes even if compression is disabled. not rewritten and will remain holes even if compression is disabled.
.Pp .Pp

View File

@ -3046,6 +3046,24 @@ dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
} }
/*
 * Report whether a block of this objset is shared with a snapshot.
 *
 * A hole is never shared, and neither is any block of an objset that
 * has no DSL dataset attached.  Otherwise the block counts as shared
 * (B_TRUE) when its birth txg is not newer than the dataset's
 * ds_prev_snap_txg, i.e. it was born before or at the time of the
 * previous snapshot, and as not shared (B_FALSE) when it is newer.
 */
boolean_t
dmu_objset_block_is_shared(objset_t *os, const blkptr_t *bp)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	/* Nothing to compare against: no data, or no dataset. */
	if (BP_IS_HOLE(bp) || ds == NULL)
		return (B_FALSE);

	/* Born after the previous snapshot: owned by this dataset only. */
	if (BP_GET_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg)
		return (B_FALSE);

	return (B_TRUE);
}
#if defined(_KERNEL) #if defined(_KERNEL)
EXPORT_SYMBOL(dmu_objset_zil); EXPORT_SYMBOL(dmu_objset_zil);
EXPORT_SYMBOL(dmu_objset_pool); EXPORT_SYMBOL(dmu_objset_pool);
@ -3090,4 +3108,5 @@ EXPORT_SYMBOL(dmu_objset_projectquota_enabled);
EXPORT_SYMBOL(dmu_objset_projectquota_present); EXPORT_SYMBOL(dmu_objset_projectquota_present);
EXPORT_SYMBOL(dmu_objset_projectquota_upgradable); EXPORT_SYMBOL(dmu_objset_projectquota_upgradable);
EXPORT_SYMBOL(dmu_objset_id_quota_upgrade); EXPORT_SYMBOL(dmu_objset_id_quota_upgrade);
EXPORT_SYMBOL(dmu_objset_block_is_shared);
#endif #endif

View File

@ -53,6 +53,7 @@
#include <sys/dsl_dataset.h> #include <sys/dsl_dataset.h>
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/txg.h> #include <sys/txg.h>
#include <sys/brt.h>
#include <sys/dbuf.h> #include <sys/dbuf.h>
#include <sys/policy.h> #include <sys/policy.h>
#include <sys/zfeature.h> #include <sys/zfeature.h>
@ -1095,6 +1096,34 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
return (0); return (0);
} }
/*
 * Decide whether rewrite should leave the block behind a buffer alone.
 *
 * Returns B_TRUE (skip) when the buffer has no block pointer, when
 * ZFS_REWRITE_SKIP_SNAPSHOT is set and dmu_objset_block_is_shared()
 * reports the block as shared, or when ZFS_REWRITE_SKIP_BRT is set and
 * brt_maybe_exists() reports a possible BRT entry for the block.
 * Returns B_FALSE in every other case.
 */
static boolean_t
zfs_rewrite_skip(dmu_buf_t *db, objset_t *os, uint64_t flags)
{
	boolean_t skip_snap = (flags & ZFS_REWRITE_SKIP_SNAPSHOT) != 0;
	boolean_t skip_brt = (flags & ZFS_REWRITE_SKIP_BRT) != 0;

	/*
	 * The block pointer read here may be slightly stale and racy;
	 * that is acceptable because the result is only advisory.
	 */
	blkptr_t *bp = dmu_buf_get_blkptr(db);

	if (bp == NULL)
		return (B_TRUE);

	/* The snapshot check runs first, then the BRT check. */
	if (skip_snap && dmu_objset_block_is_shared(os, bp))
		return (B_TRUE);

	if (skip_brt && brt_maybe_exists(os->os_spa, bp))
		return (B_TRUE);

	return (B_FALSE);
}
/* /*
* Rewrite a range of file as-is without modification. * Rewrite a range of file as-is without modification.
* *
@ -1113,7 +1142,11 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
{ {
int error; int error;
if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0) #define ZFS_REWRITE_VALID_FLAGS \
(ZFS_REWRITE_PHYSICAL | ZFS_REWRITE_SKIP_SNAPSHOT | \
ZFS_REWRITE_SKIP_BRT)
if ((flags & ~ZFS_REWRITE_VALID_FLAGS) != 0 || arg != 0)
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
zfsvfs_t *zfsvfs = ZTOZSB(zp); zfsvfs_t *zfsvfs = ZTOZSB(zp);
@ -1214,6 +1247,10 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
nr += dbp[i]->db_size; nr += dbp[i]->db_size;
if (dmu_buf_is_dirty(dbp[i], tx)) if (dmu_buf_is_dirty(dbp[i], tx))
continue; continue;
if (zfs_rewrite_skip(dbp[i], zfsvfs->z_os, flags))
continue;
nw += dbp[i]->db_size; nw += dbp[i]->db_size;
if (flags & ZFS_REWRITE_PHYSICAL) if (flags & ZFS_REWRITE_PHYSICAL)
dmu_buf_will_rewrite(dbp[i], tx); dmu_buf_will_rewrite(dbp[i], tx);

View File

@ -309,7 +309,8 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
tags = ['functional', 'cli_root', 'zfs_reservation'] tags = ['functional', 'cli_root', 'zfs_reservation']
[tests/functional/cli_root/zfs_rewrite] [tests/functional/cli_root/zfs_rewrite]
tests = ['zfs_rewrite', 'zfs_rewrite_physical'] tests = ['zfs_rewrite', 'zfs_rewrite_physical', 'zfs_rewrite_skip_clone',
'zfs_rewrite_skip_snapshot']
tags = ['functional', 'cli_root', 'zfs_rewrite'] tags = ['functional', 'cli_root', 'zfs_rewrite']
[tests/functional/cli_root/zfs_rollback] [tests/functional/cli_root/zfs_rollback]

View File

@ -195,7 +195,8 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
tags = ['functional', 'cli_root', 'zfs_reservation'] tags = ['functional', 'cli_root', 'zfs_reservation']
[tests/functional/cli_root/zfs_rewrite] [tests/functional/cli_root/zfs_rewrite]
tests = ['zfs_rewrite', 'zfs_rewrite_physical'] tests = ['zfs_rewrite', 'zfs_rewrite_physical', 'zfs_rewrite_skip_clone',
'zfs_rewrite_skip_snapshot']
tags = ['functional', 'cli_root', 'zfs_rewrite'] tags = ['functional', 'cli_root', 'zfs_rewrite']
[tests/functional/cli_root/zfs_rollback] [tests/functional/cli_root/zfs_rollback]

View File

@ -3943,4 +3943,26 @@ function pop_coredump_pattern
esac esac
} }
#
# get_same_blocks dataset1 path/to/file1 dataset2 path/to/file2 [key]
#
# Prints, space-separated on a single line, the indexes (starting at 0) of
# the L0 blocks that are shared between both files (by first DVA and
# checksum).  When the optional fifth argument is non-empty it is handed to
# zdb as --key=<key>.
#
function get_same_blocks # dataset1 file1 dataset2 file2 [key]
{
	typeset keyopt=$5
	if [ -n "$keyopt" ]; then
		keyopt="--key=$keyopt"
	fi

	# One "<index> <field 3> <field 7>" line per L0 entry of the zdb dump.
	typeset l0_prog='/ L0 / { print l++ " " $3 " " $7 }'
	typeset dump_a=$(mktemp)
	typeset dump_b=$(mktemp)

	zdb $keyopt -vvvvv $1 -O $2 | awk "$l0_prog" > $dump_a
	zdb $keyopt -vvvvv $3 -O $4 | awk "$l0_prog" > $dump_b

	# Lines present in both dumps describe shared blocks; keep the index.
	echo $(sort -n $dump_a $dump_b | uniq -d | cut -f1 -d' ')

	rm -f $dump_a $dump_b
}
. ${STF_SUITE}/include/kstat.shlib . ${STF_SUITE}/include/kstat.shlib

View File

@ -876,6 +876,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zfs_rewrite/setup.ksh \ functional/cli_root/zfs_rewrite/setup.ksh \
functional/cli_root/zfs_rewrite/zfs_rewrite.ksh \ functional/cli_root/zfs_rewrite/zfs_rewrite.ksh \
functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh \ functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh \
functional/cli_root/zfs_rewrite/zfs_rewrite_skip_clone.ksh \
functional/cli_root/zfs_rewrite/zfs_rewrite_skip_snapshot.ksh \
functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \
functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \
functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \

View File

@ -35,27 +35,3 @@ function have_same_content
log_must [ "$hash1" = "$hash2" ] log_must [ "$hash1" = "$hash2" ]
} }
#
# get_same_blocks dataset1 path/to/file1 dataset2 path/to/file2 [key]
#
# Returns a space-separated list of the indexes (starting at 0) of the L0
# blocks that are shared between both files (by first DVA and checksum).
# Assumes that the two files have the same content; use have_same_content to
# confirm that.
# The optional fifth argument, when non-empty, is passed to zdb as
# --key=<key>.
#
function get_same_blocks
{
	# Keep KEY local so it does not leak into the caller's scope.
	typeset KEY=$5
	if [ ${#KEY} -gt 0 ]; then
		KEY="--key=$KEY"
	fi
	typeset zdbout1=$(mktemp)
	typeset zdbout2=$(mktemp)
	# Emit "<index> <field 3> <field 7>" for every L0 line of each dump.
	zdb $KEY -vvvvv $1 -O $2 | \
	    awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout1
	zdb $KEY -vvvvv $3 -O $4 | \
	    awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout2
	# Lines that occur in both dumps identify the shared blocks.
	echo $(sort -n $zdbout1 $zdbout2 | uniq -d | cut -f1 -d' ')
	rm -f $zdbout1 $zdbout2
}

View File

@ -0,0 +1,83 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2026, iXsystems, Inc.
#
# DESCRIPTION:
# Verify zfs rewrite -C flag skips BRT-cloned blocks.
#
# STRATEGY:
# 1. Create a test file and sync it.
# 2. Clone the file using block cloning to share blocks via BRT.
# 3. Rewrite clone with -C flag and verify blocks are NOT rewritten.
# 4. Rewrite clone without -C flag and verify blocks ARE rewritten.
. $STF_SUITE/include/libtest.shlib

verify_runnable "global"

# Remove every file the test created in the test directory.
function cleanup
{
	rm -rf $TESTDIR/*
}

# Print the indexes of the L0 blocks that source and clone have in common.
function shared_blocks
{
	get_same_blocks $TESTPOOL/$TESTFS source $TESTPOOL/$TESTFS clone
}

log_assert "zfs rewrite -C flag skips BRT-cloned blocks"
log_onexit cleanup

log_must zfs set recordsize=128k $TESTPOOL/$TESTFS

# Source file: 4 records of 128KB each, i.e. 4 blocks.
log_must dd if=/dev/urandom of=$TESTDIR/source bs=128k count=4
log_must sync_pool $TESTPOOL

# Block-clone the source so that both files reference the same blocks.
log_must clonefile -f $TESTDIR/source $TESTDIR/clone
log_must sync_pool $TESTPOOL

typeset blocks

# Precondition: every block is shared right after cloning.
blocks=$(shared_blocks)
log_must [ "$blocks" = "0 1 2 3" ]

# Test 1: rewriting the clone WITH -C must skip all cloned blocks,
# so every block is still shared afterwards.
log_must zfs rewrite -C $TESTDIR/clone
log_must sync_pool $TESTPOOL
blocks=$(shared_blocks)
log_must [ "$blocks" = "0 1 2 3" ]

# Test 2: rewriting the clone WITHOUT -C must rewrite all blocks,
# so the clone no longer shares any block with the source.
log_must zfs rewrite $TESTDIR/clone
log_must sync_pool $TESTPOOL
blocks=$(shared_blocks)
log_must [ -z "$blocks" ]

log_pass

View File

@ -0,0 +1,74 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2026, iXsystems, Inc.
#
# DESCRIPTION:
# Verify zfs rewrite -S flag skips snapshot-shared blocks.
#
# STRATEGY:
# 1. Create a test file and sync it.
# 2. Take a snapshot to share the blocks.
# 3. Rewrite with -S flag and verify blocks are NOT rewritten.
# 4. Rewrite without -S flag and verify blocks ARE rewritten.
. $STF_SUITE/include/libtest.shlib

# Block sharing is checked with zdb through get_same_blocks.
verify_runnable "global"

# Remove the test files and the snapshot; the snapshot may not exist if the
# test failed early, so a failed destroy is ignored.
function cleanup
{
	rm -rf $TESTDIR/*
	zfs destroy -R $TESTPOOL/$TESTFS@snap1 2>/dev/null || true
}

log_assert "zfs rewrite -S flag skips snapshot-shared blocks"
log_onexit cleanup

log_must zfs set recordsize=128k $TESTPOOL/$TESTFS

# Create test file (4 x 128KB = 4 blocks) and snapshot
log_must dd if=/dev/urandom of=$TESTDIR/testfile bs=128k count=4
log_must sync_pool $TESTPOOL
log_must zfs snapshot $TESTPOOL/$TESTFS@snap1

# Test 1: Rewrite WITH -S flag (should skip all snapshot-shared blocks)
log_must zfs rewrite -S $TESTDIR/testfile
log_must sync_pool $TESTPOOL

# All blocks should still be shared (all blocks were skipped)
typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS testfile \
    $TESTPOOL/$TESTFS@snap1 testfile)
log_must [ "$blocks" = "0 1 2 3" ]

# Test 2: Rewrite WITHOUT -S flag (should rewrite all blocks)
log_must zfs rewrite $TESTDIR/testfile
log_must sync_pool $TESTPOOL

# No blocks should be shared (all blocks were rewritten)
typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS testfile \
    $TESTPOOL/$TESTFS@snap1 testfile)
log_must [ -z "$blocks" ]

log_pass