mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 03:08:51 +03:00
Fast Clone Deletion
Deleting a clone requires finding blocks are clone-only, not shared with the snapshot. This was done by traversing the entire block tree which results in a large performance penalty for sparsely written clones. This is new method keeps track of clone blocks when they are modified in a "Livelist" so that, when it’s time to delete, the clone-specific blocks are already at hand. We see performance improvements because now deletion work is proportional to the number of clone-modified blocks, not the size of the original dataset. Reviewed-by: Sean Eric Fagan <sef@ixsystems.com> Reviewed-by: Matt Ahrens <matt@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Signed-off-by: Sara Hartse <sara.hartse@delphix.com> Closes #8416
This commit is contained in:
committed by
Brian Behlendorf
parent
d274ac5460
commit
37f03da8ba
+319
-12
@@ -20,16 +20,16 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/dsl_dataset.h>
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/refcount.h>
|
||||
#include <sys/zap.h>
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/dsl_pool.h>
|
||||
#include <sys/dsl_dataset.h>
|
||||
|
||||
/*
|
||||
* Deadlist concurrency:
|
||||
@@ -51,6 +51,68 @@
|
||||
* provides its own locking, and dl_oldfmt is immutable.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Livelist Overview
|
||||
* ================
|
||||
*
|
||||
* Livelists use the same 'deadlist_t' struct as deadlists and are also used
|
||||
* to track blkptrs over the lifetime of a dataset. Livelists however, belong
|
||||
* to clones and track the blkptrs that are clone-specific (were born after
|
||||
* the clone's creation). The exception is embedded block pointers which are
|
||||
* not included in livelists because they do not need to be freed.
|
||||
*
|
||||
* When it comes time to delete the clone, the livelist provides a quick
|
||||
* reference as to what needs to be freed. For this reason, livelists also track
|
||||
* when clone-specific blkptrs are freed before deletion to prevent double
|
||||
* frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the
|
||||
* deletion algorithm iterates backwards over the livelist, matching
|
||||
* FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists
|
||||
* are also updated in the case when blkptrs are remapped: the old version
|
||||
* of the blkptr is cancelled out with a FREE and the new version is tracked
|
||||
* with an ALLOC.
|
||||
*
|
||||
* To bound the amount of memory required for deletion, livelists over a
|
||||
* certain size are spread over multiple entries. Entries are grouped by
|
||||
* birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
|
||||
* be in the same entry. This allows us to delete livelists incrementally
|
||||
* over multiple syncs, one entry at a time.
|
||||
*
|
||||
* During the lifetime of the clone, livelists can get extremely large.
|
||||
* Their size is managed by periodic condensing (preemptively cancelling out
|
||||
* FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
|
||||
* the shared space between the clone and its origin is so small that it
|
||||
* doesn't make sense to use livelists anymore.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The threshold sublist size at which we create a new sub-livelist for the
|
||||
* next txg. However, since blkptrs of the same transaction group must be in
|
||||
* the same sub-list, the actual sublist size may exceed this. When picking the
|
||||
* size we had to balance the fact that larger sublists mean fewer sublists
|
||||
* (decreasing the cost of insertion) against the consideration that sublists
|
||||
* will be loaded into memory and shouldn't take up an inordinate amount of
|
||||
* space. We settled on ~500000 entries, corresponding to roughly 128M.
|
||||
*/
|
||||
unsigned long zfs_livelist_max_entries = 500000;
|
||||
|
||||
/*
|
||||
* We can approximate how much of a performance gain a livelist will give us
|
||||
* based on the percentage of blocks shared between the clone and its origin.
|
||||
* 0 percent shared means that the clone has completely diverged and that the
|
||||
* old method is maximally effective: every read from the block tree will
|
||||
* result in lots of frees. Livelists give us gains when they track blocks
|
||||
* scattered across the tree, when one read in the old method might only
|
||||
* result in a few frees. Once the clone has been overwritten enough,
|
||||
* writes are no longer sparse and we'll no longer get much of a benefit from
|
||||
* tracking them with a livelist. We chose a lower limit of 75 percent shared
|
||||
* (25 percent overwritten). This means that 1/4 of all block pointers will be
|
||||
* freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
|
||||
* to make deletion 4x faster. Once the amount of shared space drops below this
|
||||
* threshold, the clone will revert to the old deletion method.
|
||||
*/
|
||||
int zfs_livelist_min_percent_shared = 75;
|
||||
|
||||
|
||||
static int
|
||||
dsl_deadlist_compare(const void *arg1, const void *arg2)
|
||||
{
|
||||
@@ -88,6 +150,23 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
|
||||
dl->dl_havetree = B_TRUE;
|
||||
}
|
||||
|
||||
void
|
||||
dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
|
||||
{
|
||||
dsl_deadlist_entry_t *dle;
|
||||
|
||||
ASSERT(dsl_deadlist_is_open(dl));
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
dsl_deadlist_load_tree(dl);
|
||||
mutex_exit(&dl->dl_lock);
|
||||
for (dle = avl_first(&dl->dl_tree); dle != NULL;
|
||||
dle = AVL_NEXT(&dl->dl_tree, dle)) {
|
||||
if (func(args, dle) != 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
|
||||
{
|
||||
@@ -188,7 +267,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
|
||||
|
||||
static void
|
||||
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
|
||||
const blkptr_t *bp, dmu_tx_t *tx)
|
||||
const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&dl->dl_lock));
|
||||
if (dle->dle_bpobj.bpo_object ==
|
||||
@@ -200,7 +279,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
|
||||
VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
|
||||
dle->dle_mintxg, obj, tx));
|
||||
}
|
||||
bpobj_enqueue(&dle->dle_bpobj, bp, tx);
|
||||
bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -221,14 +300,15 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
|
||||
}
|
||||
|
||||
void
|
||||
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
dsl_deadlist_entry_t dle_tofind;
|
||||
dsl_deadlist_entry_t *dle;
|
||||
avl_index_t where;
|
||||
|
||||
if (dl->dl_oldfmt) {
|
||||
bpobj_enqueue(&dl->dl_bpobj, bp, tx);
|
||||
bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -236,10 +316,12 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
dsl_deadlist_load_tree(dl);
|
||||
|
||||
dmu_buf_will_dirty(dl->dl_dbuf, tx);
|
||||
|
||||
int sign = bp_freed ? -1 : +1;
|
||||
dl->dl_phys->dl_used +=
|
||||
bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
|
||||
dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
|
||||
dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
|
||||
sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
|
||||
dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
|
||||
dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
|
||||
|
||||
dle_tofind.dle_mintxg = bp->blk_birth;
|
||||
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
|
||||
@@ -255,10 +337,26 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
}
|
||||
|
||||
ASSERT3P(dle, !=, NULL);
|
||||
dle_enqueue(dl, dle, bp, tx);
|
||||
dle_enqueue(dl, dle, bp, bp_freed, tx);
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
int
|
||||
dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
{
|
||||
dsl_deadlist_t *dl = arg;
|
||||
dsl_deadlist_insert(dl, bp, B_FALSE, tx);
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
{
|
||||
dsl_deadlist_t *dl = arg;
|
||||
dsl_deadlist_insert(dl, bp, B_TRUE, tx);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert new key in deadlist, which must be > all current entries.
|
||||
* mintxg is not inclusive.
|
||||
@@ -316,6 +414,108 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove a deadlist entry and all of its contents by removing the entry from
|
||||
* the deadlist's avl tree, freeing the entry's bpobj and adjusting the
|
||||
* deadlist's space accounting accordingly.
|
||||
*/
|
||||
void
|
||||
dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
|
||||
{
|
||||
uint64_t used, comp, uncomp;
|
||||
dsl_deadlist_entry_t dle_tofind;
|
||||
dsl_deadlist_entry_t *dle;
|
||||
objset_t *os = dl->dl_os;
|
||||
|
||||
if (dl->dl_oldfmt)
|
||||
return;
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
dsl_deadlist_load_tree(dl);
|
||||
|
||||
dle_tofind.dle_mintxg = mintxg;
|
||||
dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
|
||||
VERIFY3P(dle, !=, NULL);
|
||||
|
||||
avl_remove(&dl->dl_tree, dle);
|
||||
VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
|
||||
VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
|
||||
dl->dl_phys->dl_used -= used;
|
||||
dl->dl_phys->dl_comp -= comp;
|
||||
dl->dl_phys->dl_uncomp -= uncomp;
|
||||
if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
|
||||
bpobj_decr_empty(os, tx);
|
||||
} else {
|
||||
bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
|
||||
}
|
||||
bpobj_close(&dle->dle_bpobj);
|
||||
kmem_free(dle, sizeof (*dle));
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear out the contents of a deadlist_entry by freeing its bpobj,
|
||||
* replacing it with an empty bpobj and adjusting the deadlist's
|
||||
* space accounting
|
||||
*/
|
||||
void
|
||||
dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
uint64_t new_obj, used, comp, uncomp;
|
||||
objset_t *os = dl->dl_os;
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
|
||||
VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
|
||||
dl->dl_phys->dl_used -= used;
|
||||
dl->dl_phys->dl_comp -= comp;
|
||||
dl->dl_phys->dl_uncomp -= uncomp;
|
||||
if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
|
||||
bpobj_decr_empty(os, tx);
|
||||
else
|
||||
bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
|
||||
bpobj_close(&dle->dle_bpobj);
|
||||
new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
|
||||
VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
|
||||
VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
|
||||
new_obj, tx));
|
||||
ASSERT(bpobj_is_empty(&dle->dle_bpobj));
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the first entry in deadlist's avl tree
|
||||
*/
|
||||
dsl_deadlist_entry_t *
|
||||
dsl_deadlist_first(dsl_deadlist_t *dl)
|
||||
{
|
||||
dsl_deadlist_entry_t *dle;
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
dsl_deadlist_load_tree(dl);
|
||||
dle = avl_first(&dl->dl_tree);
|
||||
mutex_exit(&dl->dl_lock);
|
||||
|
||||
return (dle);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the last entry in deadlist's avl tree
|
||||
*/
|
||||
dsl_deadlist_entry_t *
|
||||
dsl_deadlist_last(dsl_deadlist_t *dl)
|
||||
{
|
||||
dsl_deadlist_entry_t *dle;
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
dsl_deadlist_load_tree(dl);
|
||||
dle = avl_last(&dl->dl_tree);
|
||||
mutex_exit(&dl->dl_lock);
|
||||
|
||||
return (dle);
|
||||
}
|
||||
|
||||
/*
|
||||
* Walk ds's snapshots to regenerate generate ZAP & AVL.
|
||||
*/
|
||||
@@ -478,10 +678,11 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
|
||||
}
|
||||
|
||||
static int
|
||||
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
dsl_deadlist_t *dl = arg;
|
||||
dsl_deadlist_insert(dl, bp, tx);
|
||||
dsl_deadlist_insert(dl, bp, bp_freed, tx);
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -572,3 +773,109 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
|
||||
}
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
typedef struct livelist_entry {
|
||||
const blkptr_t *le_bp;
|
||||
avl_node_t le_node;
|
||||
} livelist_entry_t;
|
||||
|
||||
static int
|
||||
livelist_compare(const void *larg, const void *rarg)
|
||||
{
|
||||
const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
|
||||
const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
|
||||
|
||||
/* Sort them according to dva[0] */
|
||||
uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
|
||||
uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
|
||||
|
||||
if (l_dva0_vdev != r_dva0_vdev)
|
||||
return (AVL_CMP(l_dva0_vdev, r_dva0_vdev));
|
||||
|
||||
/* if vdevs are equal, sort by offsets. */
|
||||
uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
|
||||
uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
|
||||
if (l_dva0_offset == r_dva0_offset)
|
||||
ASSERT3U(l->blk_birth, ==, r->blk_birth);
|
||||
return (AVL_CMP(l_dva0_offset, r_dva0_offset));
|
||||
}
|
||||
|
||||
struct livelist_iter_arg {
|
||||
avl_tree_t *avl;
|
||||
bplist_t *to_free;
|
||||
zthr_t *t;
|
||||
};
|
||||
|
||||
/*
|
||||
* Expects an AVL tree which is incrementally filled will FREE blkptrs
|
||||
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
|
||||
* corresponding FREE are stored in the supplied bplist.
|
||||
*/
|
||||
static int
|
||||
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
struct livelist_iter_arg *lia = arg;
|
||||
avl_tree_t *avl = lia->avl;
|
||||
bplist_t *to_free = lia->to_free;
|
||||
zthr_t *t = lia->t;
|
||||
ASSERT(tx == NULL);
|
||||
|
||||
if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
|
||||
return (SET_ERROR(EINTR));
|
||||
if (bp_freed) {
|
||||
livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
|
||||
KM_SLEEP);
|
||||
blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
|
||||
*temp_bp = *bp;
|
||||
node->le_bp = temp_bp;
|
||||
avl_add(avl, node);
|
||||
} else {
|
||||
livelist_entry_t node;
|
||||
node.le_bp = bp;
|
||||
livelist_entry_t *found = avl_find(avl, &node, NULL);
|
||||
if (found != NULL) {
|
||||
avl_remove(avl, found);
|
||||
kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
|
||||
kmem_free(found, sizeof (livelist_entry_t));
|
||||
} else {
|
||||
bplist_append(to_free, bp);
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs
|
||||
* which have an ALLOC entry but no matching FREE
|
||||
*/
|
||||
int
|
||||
dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
|
||||
uint64_t *size)
|
||||
{
|
||||
avl_tree_t avl;
|
||||
avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
|
||||
offsetof(livelist_entry_t, le_node));
|
||||
|
||||
/* process the sublist */
|
||||
struct livelist_iter_arg arg = {
|
||||
.avl = &avl,
|
||||
.to_free = to_free,
|
||||
.t = t
|
||||
};
|
||||
int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
|
||||
|
||||
avl_destroy(&avl);
|
||||
return (err);
|
||||
}
|
||||
|
||||
#if defined(_KERNEL)
|
||||
/* CSTYLED */
|
||||
module_param(zfs_livelist_max_entries, ulong, 0644);
|
||||
MODULE_PARM_DESC(zfs_livelist_max_entries,
|
||||
"Size to start the next sub-livelist in a livelist");
|
||||
|
||||
module_param(zfs_livelist_min_percent_shared, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_livelist_min_percent_shared,
|
||||
"Threshold at which livelist is disabled");
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user