Implementation of block cloning for ZFS

Block cloning allows a file (or a subset of its blocks) to be manually
cloned into another (or the same) file by creating additional
references to the data blocks, without copying the data itself.
Those references are kept in the Block Reference Tables (BRTs).

The whole design of block cloning is documented in module/zfs/brt.c.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Christian Schwarz <christian.schwarz@nutanix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Pawel Jakub Dawidek <pawel@dawidek.net>
Closes #13392
This commit is contained in:
Pawel Jakub Dawidek
2023-03-10 20:59:53 +01:00
committed by GitHub
parent da19d919a8
commit 67a1b03791
51 changed files with 3480 additions and 120 deletions
+111 -15
View File
@@ -43,6 +43,7 @@
#include <sys/metaslab.h>
#include <sys/trace_zfs.h>
#include <sys/abd.h>
#include <sys/brt.h>
#include <sys/wmsum.h>
/*
@@ -578,14 +579,12 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
}
static int
zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
uint64_t first_txg)
zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
{
lr_write_t *lr = (lr_write_t *)lrc;
int error;
if (lrc->lrc_txtype != TX_WRITE)
return (0);
ASSERT(lrc->lrc_txtype == TX_WRITE);
/*
* If the block is not readable, don't claim it. This can happen
@@ -604,6 +603,57 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}
/*
 * Claim handler for a TX_CLONE_RANGE log record.
 *
 * Iterates over the lr_nbps block pointers stored in the record and
 * passes every one that is neither a hole nor embedded to
 * brt_pending_add().  When no transaction is supplied (tx == NULL) the
 * record is left untouched.  Always returns 0.
 */
static int
zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
{
	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
	spa_t *spa;
	uint_t ii;

	ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);

	/* Nothing to do without a transaction. */
	if (tx == NULL)
		return (0);

	/*
	 * XXX: Do we need to byteswap lr?
	 */

	spa = zilog->zl_spa;

	for (ii = 0; ii < lr->lr_nbps; ii++) {
		const blkptr_t *bp = &lr->lr_bps[ii];

		/*
		 * When the data is embedded into the BP there is no data
		 * block, so no BRT entry needs to be created; the BP itself
		 * carries the data.  Holes are skipped as well.
		 */
		if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
			continue;

		brt_pending_add(spa, bp, tx);
	}

	return (0);
}
/*
 * Dispatch a log record to the claim handler matching its transaction
 * type.  TX_WRITE records go to zil_claim_write() and TX_CLONE_RANGE
 * records go to zil_claim_clone_range(); every other record type is
 * ignored and 0 is returned.
 */
static int
zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
    uint64_t first_txg)
{
	if (lrc->lrc_txtype == TX_WRITE)
		return (zil_claim_write(zilog, lrc, tx, first_txg));

	if (lrc->lrc_txtype == TX_CLONE_RANGE)
		return (zil_claim_clone_range(zilog, lrc, tx));

	/* No claim work for any other record type. */
	return (0);
}
static int
zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
uint64_t claim_txg)
@@ -616,23 +666,70 @@ zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
}
static int
zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
uint64_t claim_txg)
zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
{
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
ASSERT(lrc->lrc_txtype == TX_WRITE);
/*
* If we previously claimed it, we need to free it.
*/
if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
!BP_IS_HOLE(bp))
if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
!BP_IS_HOLE(bp)) {
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
}
return (0);
}
/*
 * Free handler for a TX_CLONE_RANGE log record.
 *
 * Calls zio_free() on every non-hole block pointer stored in the
 * record, using the txg obtained from the supplied transaction.  When
 * no transaction is supplied (tx == NULL) nothing is freed.  Always
 * returns 0.
 */
static int
zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
{
	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
	spa_t *spa;
	uint_t ii;

	ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);

	/* Nothing to do without a transaction. */
	if (tx == NULL)
		return (0);

	spa = zilog->zl_spa;

	for (ii = 0; ii < lr->lr_nbps; ii++) {
		const blkptr_t *bp = &lr->lr_bps[ii];

		/* Holes are skipped; only non-hole BPs are freed. */
		if (BP_IS_HOLE(bp))
			continue;

		zio_free(spa, dmu_tx_get_txg(tx), bp);
	}

	return (0);
}
/*
 * Dispatch a log record to the free handler matching its transaction
 * type.  Nothing is done when claim_txg is 0.  TX_WRITE records go to
 * zil_free_write() and TX_CLONE_RANGE records go to
 * zil_free_clone_range(); every other record type is ignored and 0 is
 * returned.
 */
static int
zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
    uint64_t claim_txg)
{
	/* A zero claim_txg means there is nothing to free. */
	if (claim_txg == 0)
		return (0);

	if (lrc->lrc_txtype == TX_WRITE)
		return (zil_free_write(zilog, lrc, tx, claim_txg));

	if (lrc->lrc_txtype == TX_CLONE_RANGE)
		return (zil_free_clone_range(zilog, lrc, tx));

	/* No free work for any other record type. */
	return (0);
}
static int
zil_lwb_vdev_compare(const void *x1, const void *x2)
{
@@ -1798,13 +1895,12 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
}
/*
* Maximum amount of write data that can be put into single log block.
* Maximum amount of data that can be put into single log block.
*/
uint64_t
zil_max_log_data(zilog_t *zilog)
zil_max_log_data(zilog_t *zilog, size_t hdrsize)
{
return (zilog->zl_max_block_size -
sizeof (zil_chain_t) - sizeof (lr_write_t));
return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize);
}
/*
@@ -1814,7 +1910,7 @@ zil_max_log_data(zilog_t *zilog)
static inline uint64_t
zil_max_waste_space(zilog_t *zilog)
{
return (zil_max_log_data(zilog) / 8);
return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8);
}
/*
@@ -1887,7 +1983,7 @@ cont:
* For WR_NEED_COPY optimize layout for minimal number of chunks.
*/
lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
max_log_data = zil_max_log_data(zilog);
max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
lwb_sp < zil_max_waste_space(zilog) &&
(dlen % max_log_data == 0 ||