mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
BRT: Fix FICLONE/FICLONERANGE shortened copy
On Linux the ioctl_ficlonerange() and ioctl_ficlone() system calls are expected to either fully clone the specified range or return an error. The range may be for an entire file. While internally ZFS supports cloning partial ranges there's no way to return the length cloned to the caller so we need to make this all or nothing. As part of this change support for the REMAP_FILE_CAN_SHORTEN flag has been added. When REMAP_FILE_CAN_SHORTEN is set zfs_clone_range() will return a shortened range when encountering pending dirty records. When it's clear zfs_clone_range() will block and wait for the records to be written out allowing the blocks to be cloned. Furthermore, the file range lock is held over the region being cloned to prevent it from being modified while cloning. This doesn't quite provide an atomic semantics since if an error is encountered only a portion of the range may be cloned. This will be converted to an error if REMAP_FILE_CAN_SHORTEN was not provided and returned to the caller. However, the destination file range is left in an undefined state. A test case has been added which exercises this functionality by verifying that `cp --reflink=never|auto|always` works correctly. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #15728 Closes #15842
This commit is contained in:
+38
-5
@@ -58,6 +58,26 @@
|
||||
#include <sys/zfs_vfsops.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
|
||||
/*
|
||||
* Enable the experimental block cloning feature. If this setting is 0, then
|
||||
* even if feature@block_cloning is enabled, attempts to clone blocks will act
|
||||
* as though the feature is disabled.
|
||||
*/
|
||||
int zfs_bclone_enabled = 0;
|
||||
|
||||
/*
|
||||
* When set zfs_clone_range() waits for dirty data to be written to disk.
|
||||
* This allows the clone operation to reliably succeed when a file is modified
|
||||
* and then immediately cloned. For small files this may be slower than making
|
||||
* a copy of the file and is therefore not the default. However, in certain
|
||||
* scenarios this behavior may be desirable so a tunable is provided.
|
||||
*/
|
||||
static int zfs_bclone_wait_dirty = 0;
|
||||
|
||||
/*
|
||||
* Maximum bytes to read per chunk in zfs_read().
|
||||
*/
|
||||
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
|
||||
|
||||
static ulong_t zfs_fsync_sync_cnt = 4;
|
||||
|
||||
@@ -189,8 +209,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
|
||||
return (error);
|
||||
}
|
||||
|
||||
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
|
||||
|
||||
/*
|
||||
* Read bytes from specified file into supplied buffer.
|
||||
*
|
||||
@@ -1055,6 +1073,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
||||
size_t maxblocks, nbps;
|
||||
uint_t inblksz;
|
||||
uint64_t clear_setid_bits_txg = 0;
|
||||
uint64_t last_synced_txg = 0;
|
||||
|
||||
inoff = *inoffp;
|
||||
outoff = *outoffp;
|
||||
@@ -1293,15 +1312,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
||||
}
|
||||
|
||||
nbps = maxblocks;
|
||||
last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
|
||||
error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
|
||||
&nbps);
|
||||
if (error != 0) {
|
||||
/*
|
||||
* If we are trying to clone a block that was created
|
||||
* in the current transaction group, error will be
|
||||
* EAGAIN here, which we can just return to the caller
|
||||
* so it can fallback if it likes.
|
||||
* in the current transaction group, the error will be
|
||||
* EAGAIN here. Based on zfs_bclone_wait_dirty either
|
||||
* return a shortened range to the caller so it can
|
||||
* fallback, or wait for the next TXG and check again.
|
||||
*/
|
||||
if (error == EAGAIN && zfs_bclone_wait_dirty) {
|
||||
txg_wait_synced(dmu_objset_pool(inos),
|
||||
last_synced_txg + 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1523,3 +1550,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay);
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
|
||||
"Bytes to read per chunk");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
|
||||
"Enable block cloning");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
|
||||
"Wait for dirty blocks when cloning");
|
||||
|
||||
Reference in New Issue
Block a user