mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-27 04:32:16 +03:00
Implementation of block cloning for ZFS
Block cloning allows a file (or a subset of its blocks) to be manually cloned into another file (or into the same file) by creating additional references to the existing data blocks instead of copying the data itself. Those references are kept in the Block Reference Tables (BRTs). The whole design of block cloning is documented in module/zfs/brt.c. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Christian Schwarz <christian.schwarz@nutanix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Rich Ercolani <rincebrain@gmail.com> Signed-off-by: Pawel Jakub Dawidek <pawel@dawidek.net> Closes #13392
This commit is contained in:
committed by
GitHub
parent
da19d919a8
commit
67a1b03791
+152
-1
@@ -29,6 +29,7 @@
|
||||
* Copyright (c) 2019, Klara Inc.
|
||||
* Copyright (c) 2019, Allan Jude
|
||||
* Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
|
||||
* Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
|
||||
*/
|
||||
|
||||
#include <sys/dmu.h>
|
||||
@@ -52,6 +53,7 @@
|
||||
#include <sys/sa.h>
|
||||
#include <sys/zfeature.h>
|
||||
#include <sys/abd.h>
|
||||
#include <sys/brt.h>
|
||||
#include <sys/trace_zfs.h>
|
||||
#include <sys/zfs_racct.h>
|
||||
#include <sys/zfs_rlock.h>
|
||||
@@ -513,7 +515,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
zio_t *zio = NULL;
|
||||
boolean_t missed = B_FALSE;
|
||||
|
||||
ASSERT(length <= DMU_MAX_ACCESS);
|
||||
ASSERT(!read || length <= DMU_MAX_ACCESS);
|
||||
|
||||
/*
|
||||
* Note: We directly notify the prefetch code of this read, so that
|
||||
@@ -2165,6 +2167,155 @@ restart:
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
|
||||
dmu_tx_t *tx, blkptr_t *bps, size_t *nbpsp)
|
||||
{
|
||||
dmu_buf_t **dbp, *dbuf;
|
||||
dmu_buf_impl_t *db;
|
||||
blkptr_t *bp;
|
||||
int error, numbufs;
|
||||
|
||||
error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
|
||||
&numbufs, &dbp);
|
||||
if (error != 0) {
|
||||
if (error == ESRCH) {
|
||||
error = SET_ERROR(ENXIO);
|
||||
}
|
||||
return (error);
|
||||
}
|
||||
|
||||
ASSERT3U(numbufs, <=, *nbpsp);
|
||||
|
||||
for (int i = 0; i < numbufs; i++) {
|
||||
dbuf = dbp[i];
|
||||
db = (dmu_buf_impl_t *)dbuf;
|
||||
bp = db->db_blkptr;
|
||||
|
||||
/*
|
||||
* If the block is not on the disk yet, it has no BP assigned.
|
||||
* There is not much we can do...
|
||||
*/
|
||||
if (!list_is_empty(&db->db_dirty_records)) {
|
||||
dbuf_dirty_record_t *dr;
|
||||
|
||||
dr = list_head(&db->db_dirty_records);
|
||||
if (dr->dt.dl.dr_brtwrite) {
|
||||
/*
|
||||
* This is very special case where we clone a
|
||||
* block and in the same transaction group we
|
||||
* read its BP (most likely to clone the clone).
|
||||
*/
|
||||
bp = &dr->dt.dl.dr_overridden_by;
|
||||
} else {
|
||||
/*
|
||||
* The block was modified in the same
|
||||
* transaction group.
|
||||
*/
|
||||
error = SET_ERROR(EAGAIN);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if (bp == NULL) {
|
||||
/*
|
||||
* The block was created in this transaction group,
|
||||
* so it has no BP yet.
|
||||
*/
|
||||
error = SET_ERROR(EAGAIN);
|
||||
goto out;
|
||||
}
|
||||
if (dmu_buf_is_dirty(dbuf, tx)) {
|
||||
error = SET_ERROR(EAGAIN);
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* Make sure we clone only data blocks.
|
||||
*/
|
||||
if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) {
|
||||
error = SET_ERROR(EINVAL);
|
||||
goto out;
|
||||
}
|
||||
|
||||
bps[i] = *bp;
|
||||
}
|
||||
|
||||
*nbpsp = numbufs;
|
||||
out:
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
void
|
||||
dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
|
||||
dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay)
|
||||
{
|
||||
spa_t *spa;
|
||||
dmu_buf_t **dbp, *dbuf;
|
||||
dmu_buf_impl_t *db;
|
||||
struct dirty_leaf *dl;
|
||||
dbuf_dirty_record_t *dr;
|
||||
const blkptr_t *bp;
|
||||
int numbufs;
|
||||
|
||||
spa = os->os_spa;
|
||||
|
||||
VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
|
||||
&numbufs, &dbp));
|
||||
ASSERT3U(nbps, ==, numbufs);
|
||||
|
||||
for (int i = 0; i < numbufs; i++) {
|
||||
dbuf = dbp[i];
|
||||
db = (dmu_buf_impl_t *)dbuf;
|
||||
bp = &bps[i];
|
||||
|
||||
ASSERT0(db->db_level);
|
||||
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
||||
ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
|
||||
|
||||
if (db->db_state == DB_UNCACHED) {
|
||||
/*
|
||||
* XXX-PJD: If the dbuf is already cached, calling
|
||||
* dmu_buf_will_not_fill() will panic on assertion
|
||||
* (db->db_buf == NULL) in dbuf_clear_data(),
|
||||
* which is called from dbuf_noread() in DB_NOFILL
|
||||
* case. I'm not 100% sure this is the right thing
|
||||
* to do, but it seems to work.
|
||||
*/
|
||||
dmu_buf_will_not_fill(dbuf, tx);
|
||||
}
|
||||
|
||||
dr = list_head(&db->db_dirty_records);
|
||||
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
|
||||
dl = &dr->dt.dl;
|
||||
dl->dr_overridden_by = *bp;
|
||||
dl->dr_brtwrite = B_TRUE;
|
||||
|
||||
dl->dr_override_state = DR_OVERRIDDEN;
|
||||
if (BP_IS_HOLE(bp)) {
|
||||
dl->dr_overridden_by.blk_birth = 0;
|
||||
dl->dr_overridden_by.blk_phys_birth = 0;
|
||||
} else {
|
||||
dl->dr_overridden_by.blk_birth = dr->dr_txg;
|
||||
dl->dr_overridden_by.blk_phys_birth =
|
||||
BP_PHYSICAL_BIRTH(bp);
|
||||
}
|
||||
|
||||
/*
|
||||
* When data in embedded into BP there is no need to create
|
||||
* BRT entry as there is no data block. Just copy the BP as
|
||||
* it contains the data.
|
||||
* Also, when replaying ZIL we don't want to bump references
|
||||
* in the BRT as it was already done during ZIL claim.
|
||||
*/
|
||||
if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
|
||||
brt_pending_add(spa, bp, tx);
|
||||
}
|
||||
}
|
||||
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
}
|
||||
|
||||
void
|
||||
__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user