mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-25 03:37:45 +03:00
Fast Clone Deletion
Deleting a clone requires finding blocks are clone-only, not shared with the snapshot. This was done by traversing the entire block tree which results in a large performance penalty for sparsely written clones. This is new method keeps track of clone blocks when they are modified in a "Livelist" so that, when it’s time to delete, the clone-specific blocks are already at hand. We see performance improvements because now deletion work is proportional to the number of clone-modified blocks, not the size of the original dataset. Reviewed-by: Sean Eric Fagan <sef@ixsystems.com> Reviewed-by: Matt Ahrens <matt@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Signed-off-by: Sara Hartse <sara.hartse@delphix.com> Closes #8416
This commit is contained in:
committed by
Brian Behlendorf
parent
d274ac5460
commit
37f03da8ba
+91
-23
@@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2017 Datto Inc.
|
||||
*/
|
||||
|
||||
@@ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
|
||||
size = BPOBJ_SIZE_V0;
|
||||
else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
|
||||
size = BPOBJ_SIZE_V1;
|
||||
else if (!spa_feature_is_active(dmu_objset_spa(os),
|
||||
SPA_FEATURE_LIVELIST))
|
||||
size = BPOBJ_SIZE_V2;
|
||||
else
|
||||
size = sizeof (bpobj_phys_t);
|
||||
|
||||
@@ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
|
||||
bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
|
||||
bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
|
||||
bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
|
||||
bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
|
||||
bpo->bpo_phys = bpo->bpo_dbuf->db_data;
|
||||
return (0);
|
||||
}
|
||||
@@ -245,8 +249,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
|
||||
* Update bpobj and all of its parents with new space accounting.
|
||||
*/
|
||||
static void
|
||||
propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
|
||||
uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
|
||||
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
|
||||
int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
|
||||
{
|
||||
|
||||
for (; bpi != NULL; bpi = bpi->bpi_parent) {
|
||||
@@ -263,22 +267,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
|
||||
|
||||
static int
|
||||
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
|
||||
dmu_tx_t *tx, boolean_t free)
|
||||
int64_t start, dmu_tx_t *tx, boolean_t free)
|
||||
{
|
||||
int err = 0;
|
||||
uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
|
||||
int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
|
||||
dmu_buf_t *dbuf = NULL;
|
||||
bpobj_t *bpo = bpi->bpi_bpo;
|
||||
|
||||
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
|
||||
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
|
||||
uint64_t offset = i * sizeof (blkptr_t);
|
||||
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
|
||||
|
||||
if (dbuf == NULL || dbuf->db_offset > offset) {
|
||||
if (dbuf)
|
||||
dmu_buf_rele(dbuf, FTAG);
|
||||
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
|
||||
FTAG, &dbuf, 0);
|
||||
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
|
||||
offset, FTAG, &dbuf, 0);
|
||||
if (err)
|
||||
break;
|
||||
}
|
||||
@@ -288,18 +292,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
|
||||
|
||||
blkptr_t *bparray = dbuf->db_data;
|
||||
blkptr_t *bp = &bparray[blkoff];
|
||||
err = func(arg, bp, tx);
|
||||
|
||||
boolean_t bp_freed = BP_GET_FREE(bp);
|
||||
err = func(arg, bp, bp_freed, tx);
|
||||
if (err)
|
||||
break;
|
||||
|
||||
if (free) {
|
||||
int sign = bp_freed ? -1 : +1;
|
||||
spa_t *spa = dmu_objset_spa(bpo->bpo_os);
|
||||
freed += bp_get_dsize_sync(spa, bp);
|
||||
comp_freed += BP_GET_PSIZE(bp);
|
||||
uncomp_freed += BP_GET_UCSIZE(bp);
|
||||
freed += sign * bp_get_dsize_sync(spa, bp);
|
||||
comp_freed += sign * BP_GET_PSIZE(bp);
|
||||
uncomp_freed += sign * BP_GET_UCSIZE(bp);
|
||||
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
|
||||
bpo->bpo_phys->bpo_num_blkptrs--;
|
||||
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
|
||||
if (bp_freed) {
|
||||
ASSERT(bpo->bpo_havefreed);
|
||||
bpo->bpo_phys->bpo_num_freed--;
|
||||
ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (free) {
|
||||
@@ -328,7 +340,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
|
||||
*/
|
||||
static int
|
||||
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
dmu_tx_t *tx, boolean_t free)
|
||||
dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
|
||||
{
|
||||
list_t stack;
|
||||
bpobj_info_t *bpi;
|
||||
@@ -341,6 +353,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
list_create(&stack, sizeof (bpobj_info_t),
|
||||
offsetof(bpobj_info_t, bpi_node));
|
||||
mutex_enter(&initial_bpo->bpo_lock);
|
||||
|
||||
if (bpobj_size != NULL)
|
||||
*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
|
||||
|
||||
list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
|
||||
|
||||
while ((bpi = list_head(&stack)) != NULL) {
|
||||
@@ -354,7 +370,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
|
||||
|
||||
if (bpi->bpi_visited == B_FALSE) {
|
||||
err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
|
||||
err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
|
||||
free);
|
||||
bpi->bpi_visited = B_TRUE;
|
||||
if (err != 0)
|
||||
break;
|
||||
@@ -433,6 +450,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
* We have unprocessed subobjs. Process the next one.
|
||||
*/
|
||||
ASSERT(bpo->bpo_havecomp);
|
||||
ASSERT3P(bpobj_size, ==, NULL);
|
||||
|
||||
/* Add the last subobj to stack. */
|
||||
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
|
||||
@@ -489,16 +507,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
int
|
||||
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
|
||||
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate the entries. If func returns nonzero, iteration will stop.
|
||||
*
|
||||
* If there are no subobjs:
|
||||
*
|
||||
* *bpobj_size can be used to return the number of block pointers in the
|
||||
* bpobj. Note that this may be different from the number of block pointers
|
||||
* that are iterated over, if iteration is terminated early (e.g. by the func
|
||||
* returning nonzero).
|
||||
*
|
||||
* If there are concurrent (or subsequent) modifications to the bpobj then the
|
||||
* returned *bpobj_size can be passed as "start" to
|
||||
* livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
|
||||
*/
|
||||
int
|
||||
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
|
||||
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
|
||||
uint64_t *bpobj_size)
|
||||
{
|
||||
return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
|
||||
return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate over the blkptrs in the bpobj beginning at index start. If func
|
||||
* returns nonzero, iteration will stop. This is a livelist specific function
|
||||
* since it assumes that there are no subobjs present.
|
||||
*/
|
||||
int
|
||||
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
|
||||
int64_t start)
|
||||
{
|
||||
if (bpo->bpo_havesubobj)
|
||||
VERIFY0(bpo->bpo_phys->bpo_subobjs);
|
||||
bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
|
||||
int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
|
||||
kmem_free(bpi, sizeof (bpobj_info_t));
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -724,7 +771,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
|
||||
}
|
||||
|
||||
void
|
||||
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
blkptr_t stored_bp = *bp;
|
||||
uint64_t offset;
|
||||
@@ -755,8 +803,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
|
||||
}
|
||||
|
||||
/* We never need the fill count. */
|
||||
stored_bp.blk_fill = 0;
|
||||
BP_SET_FREE(&stored_bp, bp_freed);
|
||||
|
||||
mutex_enter(&bpo->bpo_lock);
|
||||
|
||||
@@ -779,11 +827,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
|
||||
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
|
||||
bpo->bpo_phys->bpo_num_blkptrs++;
|
||||
bpo->bpo_phys->bpo_bytes +=
|
||||
int sign = bp_freed ? -1 : +1;
|
||||
bpo->bpo_phys->bpo_bytes += sign *
|
||||
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
|
||||
if (bpo->bpo_havecomp) {
|
||||
bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
|
||||
bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
|
||||
bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
|
||||
bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
|
||||
}
|
||||
if (bp_freed) {
|
||||
ASSERT(bpo->bpo_havefreed);
|
||||
bpo->bpo_phys->bpo_num_freed++;
|
||||
}
|
||||
mutex_exit(&bpo->bpo_lock);
|
||||
}
|
||||
@@ -799,7 +852,7 @@ struct space_range_arg {
|
||||
|
||||
/* ARGSUSED */
|
||||
static int
|
||||
space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
|
||||
{
|
||||
struct space_range_arg *sra = arg;
|
||||
|
||||
@@ -863,3 +916,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
|
||||
*uncompp = sra.uncomp;
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
* A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
|
||||
* bpobj are designated as free or allocated that information is not preserved
|
||||
* in bplists.
|
||||
*/
|
||||
/* ARGSUSED */
|
||||
int
|
||||
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
bplist_t *bpl = arg;
|
||||
bplist_append(bpl, bp);
|
||||
return (0);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user