Implementation of block cloning for ZFS

Block Cloning allows to manually clone a file (or a subset of its
blocks) into another (or the same) file by just creating additional
references to the data blocks without copying the data itself.
Those references are kept in the Block Reference Tables (BRTs).

The whole design of block cloning is documented in module/zfs/brt.c.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Christian Schwarz <christian.schwarz@nutanix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Pawel Jakub Dawidek <pawel@dawidek.net>
Closes #13392
This commit is contained in:
Pawel Jakub Dawidek
2023-03-10 20:59:53 +01:00
committed by GitHub
parent da19d919a8
commit 67a1b03791
51 changed files with 3480 additions and 120 deletions
+2
View File
@@ -97,6 +97,8 @@ __FBSDID("$FreeBSD$");
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0,
"ZFS adaptive replacement cache");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0,
"ZFS Block Reference Table");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0,
+6 -1
View File
@@ -153,7 +153,12 @@ struct vfsops zfs_vfsops = {
.vfs_quotactl = zfs_quotactl,
};
VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
#ifdef VFCF_CROSS_COPY_FILE_RANGE
VFS_SET(zfs_vfsops, zfs,
VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE);
#else
VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL);
#endif
/*
* We need to keep a count of active fs's.
+94 -1
View File
@@ -30,7 +30,6 @@
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
@@ -84,6 +83,12 @@
#include <vm/vm_param.h>
#include <sys/zil.h>
#include <sys/zfs_vnops.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <security/mac/mac_framework.h>
#include <sys/dmu_impl.h>
#include <sys/brt.h>
#include <sys/zfeature.h>
#include <vm/vm_object.h>
@@ -6209,6 +6214,93 @@ zfs_deallocate(struct vop_deallocate_args *ap)
}
#endif
#ifndef _SYS_SYSPROTO_H_
struct vop_copy_file_range_args {
struct vnode *a_invp;
off_t *a_inoffp;
struct vnode *a_outvp;
off_t *a_outoffp;
size_t *a_lenp;
unsigned int a_flags;
struct ucred *a_incred;
struct ucred *a_outcred;
struct thread *a_fsizetd;
}
#endif
/*
* TODO: FreeBSD will only call file system-specific copy_file_range() if both
* files resides under the same mountpoint. In case of ZFS we want to be called
* even is files are in different datasets (but on the same pools, but we need
* to check that ourselves).
*/
static int
zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
{
struct vnode *invp = ap->a_invp;
struct vnode *outvp = ap->a_outvp;
struct mount *mp;
struct uio io;
int error;
/*
* TODO: If offset/length is not aligned to recordsize, use
* vn_generic_copy_file_range() on this fragment.
* It would be better to do this after we lock the vnodes, but then we
* need something else than vn_generic_copy_file_range().
*/
/* Lock both vnodes, avoiding risk of deadlock. */
do {
mp = NULL;
error = vn_start_write(outvp, &mp, V_WAIT);
if (error == 0) {
error = vn_lock(outvp, LK_EXCLUSIVE);
if (error == 0) {
if (invp == outvp)
break;
error = vn_lock(invp, LK_SHARED | LK_NOWAIT);
if (error == 0)
break;
VOP_UNLOCK(outvp);
if (mp != NULL)
vn_finished_write(mp);
mp = NULL;
error = vn_lock(invp, LK_SHARED);
if (error == 0)
VOP_UNLOCK(invp);
}
}
if (mp != NULL)
vn_finished_write(mp);
} while (error == 0);
if (error != 0)
return (error);
#ifdef MAC
error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred,
outvp);
if (error != 0)
goto unlock;
#endif
io.uio_offset = *ap->a_outoffp;
io.uio_resid = *ap->a_lenp;
error = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd);
if (error != 0)
goto unlock;
error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
ap->a_outoffp, ap->a_lenp, ap->a_fsizetd->td_ucred);
unlock:
if (invp != outvp)
VOP_UNLOCK(invp);
VOP_UNLOCK(outvp);
if (mp != NULL)
vn_finished_write(mp);
return (error);
}
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
@@ -6272,6 +6364,7 @@ struct vop_vector zfs_vnodeops = {
#if __FreeBSD_version >= 1400043
.vop_add_writecount = vop_stdadd_writecount_nomsync,
#endif
.vop_copy_file_range = zfs_freebsd_copy_file_range,
};
VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
+26
View File
@@ -34,6 +34,7 @@
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
@@ -2113,3 +2114,28 @@ zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
return (err);
}
#endif /* _KERNEL */
#ifdef _KERNEL
int
zfs_rlimit_fsize(off_t fsize)
{
struct thread *td = curthread;
off_t lim;
if (td == NULL)
return (0);
lim = lim_cur(td, RLIMIT_FSIZE);
if (__predict_true((uoff_t)fsize <= lim))
return (0);
/*
* The limit is reached.
*/
PROC_LOCK(td->td_proc);
kern_psignal(td->td_proc, SIGXFSZ);
PROC_UNLOCK(td->td_proc);
return (EFBIG);
}
#endif /* _KERNEL */