Fast Clone Deletion

Deleting a clone requires finding blocks that are clone-only, not shared
with the snapshot. This was done by traversing the entire block tree
which results in a large performance penalty for sparsely
written clones.

This new method keeps track of clone blocks when they are
modified in a "Livelist" so that, when it’s time to delete,
the clone-specific blocks are already at hand.

We see performance improvements because now deletion work is
proportional to the number of clone-modified blocks, not the size
of the original dataset.

Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Closes #8416
This commit is contained in:
Sara Hartse
2019-07-26 10:54:14 -07:00
committed by Brian Behlendorf
parent d274ac5460
commit 37f03da8ba
38 changed files with 2583 additions and 205 deletions
+91 -23
View File
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2017 Datto Inc.
*/
@@ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
size = BPOBJ_SIZE_V0;
else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
size = BPOBJ_SIZE_V1;
else if (!spa_feature_is_active(dmu_objset_spa(os),
SPA_FEATURE_LIVELIST))
size = BPOBJ_SIZE_V2;
else
size = sizeof (bpobj_phys_t);
@@ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
bpo->bpo_phys = bpo->bpo_dbuf->db_data;
return (0);
}
@@ -245,8 +249,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
* Update bpobj and all of its parents with new space accounting.
*/
static void
propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{
for (; bpi != NULL; bpi = bpi->bpi_parent) {
@@ -263,22 +267,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
dmu_tx_t *tx, boolean_t free)
int64_t start, dmu_tx_t *tx, boolean_t free)
{
int err = 0;
uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
dmu_buf_t *dbuf = NULL;
bpobj_t *bpo = bpi->bpi_bpo;
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
uint64_t offset = i * sizeof (blkptr_t);
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
FTAG, &dbuf, 0);
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
offset, FTAG, &dbuf, 0);
if (err)
break;
}
@@ -288,18 +292,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
blkptr_t *bparray = dbuf->db_data;
blkptr_t *bp = &bparray[blkoff];
err = func(arg, bp, tx);
boolean_t bp_freed = BP_GET_FREE(bp);
err = func(arg, bp, bp_freed, tx);
if (err)
break;
if (free) {
int sign = bp_freed ? -1 : +1;
spa_t *spa = dmu_objset_spa(bpo->bpo_os);
freed += bp_get_dsize_sync(spa, bp);
comp_freed += BP_GET_PSIZE(bp);
uncomp_freed += BP_GET_UCSIZE(bp);
freed += sign * bp_get_dsize_sync(spa, bp);
comp_freed += sign * BP_GET_PSIZE(bp);
uncomp_freed += sign * BP_GET_UCSIZE(bp);
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
bpo->bpo_phys->bpo_num_blkptrs--;
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
if (bp_freed) {
ASSERT(bpo->bpo_havefreed);
bpo->bpo_phys->bpo_num_freed--;
ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
}
}
}
if (free) {
@@ -328,7 +340,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
*/
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
dmu_tx_t *tx, boolean_t free)
dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
list_t stack;
bpobj_info_t *bpi;
@@ -341,6 +353,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
list_create(&stack, sizeof (bpobj_info_t),
offsetof(bpobj_info_t, bpi_node));
mutex_enter(&initial_bpo->bpo_lock);
if (bpobj_size != NULL)
*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
while ((bpi = list_head(&stack)) != NULL) {
@@ -354,7 +370,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpi->bpi_visited == B_FALSE) {
err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
free);
bpi->bpi_visited = B_TRUE;
if (err != 0)
break;
@@ -433,6 +450,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
* We have unprocessed subobjs. Process the next one.
*/
ASSERT(bpo->bpo_havecomp);
ASSERT3P(bpobj_size, ==, NULL);
/* Add the last subobj to stack. */
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
@@ -489,16 +507,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}
/*
* Iterate the entries. If func returns nonzero, iteration will stop.
*
* If there are no subobjs:
*
* *bpobj_size can be used to return the number of block pointers in the
* bpobj. Note that this may be different from the number of block pointers
* that are iterated over, if iteration is terminated early (e.g. by the func
* returning nonzero).
*
* If there are concurrent (or subsequent) modifications to the bpobj then the
* returned *bpobj_size can be passed as "start" to
* livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
*/
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
uint64_t *bpobj_size)
{
return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}
/*
 * Livelist-only helper: invoke func on the block pointers held directly in
 * bpo, walking from the last entry down to index "start". Nothing is
 * removed from the bpobj (no transaction is supplied and freeing is
 * disabled). The walk ends early as soon as func, or the underlying buffer
 * hold, reports a nonzero error, and that error is returned; otherwise the
 * result is 0.
 *
 * The bpobj must not contain any subobjs; this is verified whenever the
 * bpobj format is able to hold them.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
	bpobj_info_t *info;
	int error;

	/* A livelist bpobj is expected to have no subobj object at all. */
	if (bpo->bpo_havesubobj) {
		VERIFY0(bpo->bpo_phys->bpo_subobjs);
	}

	/* bpobj_iterate_blkptrs() wants a bpobj_info_t wrapper; make one. */
	info = bpi_alloc(bpo, NULL, 0);
	error = bpobj_iterate_blkptrs(info, func, arg, start, NULL, B_FALSE);
	kmem_free(info, sizeof (bpobj_info_t));

	return (error);
}
/*
@@ -724,7 +771,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
}
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
blkptr_t stored_bp = *bp;
uint64_t offset;
@@ -755,8 +803,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
}
/* We never need the fill count. */
stored_bp.blk_fill = 0;
BP_SET_FREE(&stored_bp, bp_freed);
mutex_enter(&bpo->bpo_lock);
@@ -779,11 +827,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
bpo->bpo_phys->bpo_num_blkptrs++;
bpo->bpo_phys->bpo_bytes +=
int sign = bp_freed ? -1 : +1;
bpo->bpo_phys->bpo_bytes += sign *
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
if (bpo->bpo_havecomp) {
bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
}
if (bp_freed) {
ASSERT(bpo->bpo_havefreed);
bpo->bpo_phys->bpo_num_freed++;
}
mutex_exit(&bpo->bpo_lock);
}
@@ -799,7 +852,7 @@ struct space_range_arg {
/* ARGSUSED */
static int
space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
struct space_range_arg *sra = arg;
@@ -863,3 +916,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
*uncompp = sra.uncomp;
return (err);
}
/*
 * bpobj_itor_t callback that appends each visited blkptr to the bplist
 * passed through "arg". A bpobj entry is tagged as either free or
 * allocated, but a bplist does not record that distinction, so bp_freed
 * is dropped here. The tx argument is likewise unused. Always returns 0,
 * so this callback never stops the iteration.
 */
/* ARGSUSED */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	bplist_t *target_list = arg;

	bplist_append(target_list, bp);

	return (0);
}