Default to zfs_bclone_wait_dirty=1

Update the default FICLONE and FICLONERANGE ioctl behavior to wait
on dirty blocks.  While this does remove some control from the
application, in practice ZFS is better positioned to do the optimal
thing and immediately force a TXG sync.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #17455
This commit is contained in:
Brian Behlendorf 2025-07-25 07:42:23 -07:00 committed by Alexander Motin
parent 6d378564b4
commit 582e7847f6
4 changed files with 39 additions and 14 deletions

View File

@ -1384,14 +1384,15 @@ If this setting is 0, then even if feature@block_cloning is enabled,
using functions and system calls that attempt to clone blocks will act as using functions and system calls that attempt to clone blocks will act as
though the feature is disabled. though the feature is disabled.
. .
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int .It Sy zfs_bclone_wait_dirty Ns = Ns Sy 1 Ns | Ns 0 Pq int
When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty
written to disk. data to be written to disk before proceeding.
This allows the clone operation to reliably succeed when a file is This ensures that the clone operation reliably succeeds, even if a file is
modified and then immediately cloned. modified and then immediately cloned.
For small files this may be slower than making a copy of the file. Note that for small files this may be slower than simply copying the file.
Therefore, this setting defaults to 0 which causes a clone operation to When set to 0 the clone operation will immediately fail if it encounters
immediately fail when encountering a dirty block. any dirty blocks.
By default waiting is enabled.
. .
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
Select a BLAKE3 implementation. Select a BLAKE3 implementation.

View File

@ -67,13 +67,14 @@
int zfs_bclone_enabled = 1; int zfs_bclone_enabled = 1;
/* /*
* When set zfs_clone_range() waits for dirty data to be written to disk. * When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty
* This allows the clone operation to reliably succeed when a file is modified * data to be written to disk before proceeding. This ensures that the clone
* and then immediately cloned. For small files this may be slower than making * operation reliably succeeds, even if a file is modified and then immediately
* a copy of the file and is therefore not the default. However, in certain * cloned. Note that for small files this may be slower than simply copying
* scenarios this behavior may be desirable so a tunable is provided. * the file. When set to 0 the clone operation will immediately fail if it
* encounters any dirty blocks. By default waiting is enabled.
*/ */
int zfs_bclone_wait_dirty = 0; int zfs_bclone_wait_dirty = 1;
/* /*
* Enable Direct I/O. If this setting is 0, then all I/O requests will be * Enable Direct I/O. If this setting is 0, then all I/O requests will be

View File

@ -41,16 +41,22 @@ function cleanup
{ {
datasetexists $TESTPOOL && destroy_pool $TESTPOOL datasetexists $TESTPOOL && destroy_pool $TESTPOOL
set_tunable64 TXG_TIMEOUT $timeout set_tunable64 TXG_TIMEOUT $timeout
log_must restore_tunable BCLONE_WAIT_DIRTY
} }
log_onexit cleanup log_onexit cleanup
log_must save_tunable BCLONE_WAIT_DIRTY
log_must set_tunable64 TXG_TIMEOUT 5000 log_must set_tunable64 TXG_TIMEOUT 5000
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must sync_pool $TESTPOOL true log_must sync_pool $TESTPOOL true
# Verify fallback to copy when there are dirty blocks
log_must set_tunable32 BCLONE_WAIT_DIRTY 0
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
@ -61,5 +67,20 @@ log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "" ] log_must [ "$blocks" = "" ]
log_must rm /$TESTPOOL/file /$TESTPOOL/clone
# Verify blocks are cloned even when there are dirty blocks
log_must set_tunable32 BCLONE_WAIT_DIRTY 1
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "0 1 2 3" ]
log_pass $claim log_pass $claim

View File

@ -56,7 +56,7 @@ function cleanup
{ {
datasetexists $TESTPOOL/cp-reflink && \ datasetexists $TESTPOOL/cp-reflink && \
destroy_dataset $$TESTPOOL/cp-reflink -f destroy_dataset $$TESTPOOL/cp-reflink -f
log_must set_tunable32 BCLONE_WAIT_DIRTY 0 log_must restore_tunable BCLONE_WAIT_DIRTY
} }
function verify_copy function verify_copy
@ -81,6 +81,8 @@ SRC_SIZE=$((1024 + $RANDOM % 1024))
# A smaller recordsize is used merely to speed up the test. # A smaller recordsize is used merely to speed up the test.
RECORDSIZE=4096 RECORDSIZE=4096
log_must save_tunable BCLONE_WAIT_DIRTY
log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink
CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink) CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink)