Fast Clone Deletion

Deleting a clone requires finding which blocks are clone-only, not
shared with the snapshot. This was previously done by traversing the
entire block tree, which results in a large performance penalty for
sparsely written clones.

This new method keeps track of clone blocks in a "Livelist" as they
are modified so that, when it’s time to delete, the clone-specific
blocks are already at hand.

We see performance improvements because now deletion work is
proportional to the number of clone-modified blocks, not the size
of the original dataset.

Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Closes #8416
This commit is contained in:
Sara Hartse
2019-07-26 10:54:14 -07:00
committed by Brian Behlendorf
parent d274ac5460
commit 37f03da8ba
38 changed files with 2583 additions and 205 deletions
+103 -1
View File
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 Martin Matuska. All rights reserved.
* Copyright (c) 2014 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -48,6 +48,7 @@
#include <sys/policy.h>
#include <sys/zfs_znode.h>
#include <sys/zvol.h>
#include <sys/zthr.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@@ -155,6 +156,9 @@ dsl_dir_evict_async(void *dbu)
spa_async_close(dd->dd_pool->dp_spa, dd);
if (dsl_deadlist_is_open(&dd->dd_livelist))
dsl_dir_livelist_close(dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
@@ -255,6 +259,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_origin_txg =
origin_phys->ds_creation_txg;
dmu_buf_rele(origin_bonus, FTAG);
if (dsl_dir_is_zapified(dd)) {
uint64_t obj;
err = zap_lookup(dp->dp_meta_objset,
dd->dd_object, DD_FIELD_LIVELIST,
sizeof (uint64_t), 1, &obj);
if (err == 0)
dsl_dir_livelist_open(dd, obj);
else if (err != ENOENT)
goto errout;
}
}
dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
@@ -263,6 +277,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
if (winner != NULL) {
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
if (dsl_deadlist_is_open(&dd->dd_livelist))
dsl_dir_livelist_close(dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
@@ -291,6 +307,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
errout:
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
if (dsl_deadlist_is_open(&dd->dd_livelist))
dsl_dir_livelist_close(dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
@@ -2178,6 +2196,90 @@ dsl_dir_is_zapified(dsl_dir_t *dd)
return (doi.doi_type == DMU_OTN_ZAP_METADATA);
}
/*
 * Open the livelist stored in object 'obj' of the pool's meta-objset into
 * dd->dd_livelist, and create the two block-pointer lists
 * (dd_pending_allocs / dd_pending_frees) that accompany it.
 *
 * The SPA_FEATURE_LIVELIST feature is asserted to be active.
 */
void
dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
{
	ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
	    SPA_FEATURE_LIVELIST));

	dsl_deadlist_open(&dd->dd_livelist, dd->dd_pool->dp_meta_objset, obj);

	bplist_create(&dd->dd_pending_allocs);
	bplist_create(&dd->dd_pending_frees);
}
/*
 * Counterpart of dsl_dir_livelist_open(): close dd->dd_livelist and destroy
 * the dd_pending_allocs / dd_pending_frees block-pointer lists.
 *
 * This function does not itself check whether the livelist is open; the
 * callers visible in this file test dsl_deadlist_is_open() first.
 */
void
dsl_dir_livelist_close(dsl_dir_t *dd)
{
	/* Close the on-disk list handle first ... */
	dsl_deadlist_close(&dd->dd_livelist);

	/* ... then tear down the in-memory pending lists. */
	bplist_destroy(&dd->dd_pending_allocs);
	bplist_destroy(&dd->dd_pending_frees);
}
/*
 * Close dd's livelist and remove the DD_FIELD_LIVELIST entry from the
 * dir's ZAP object in the meta-objset. Does nothing if the livelist is
 * not open.
 *
 * dd    - the dsl_dir whose livelist is being removed.
 * tx    - the transaction in which the ZAP entry is removed (and, when
 *         'total' is set, in which the livelist object is freed).
 * total - when B_TRUE, also free the livelist's deadlist object and
 *         decrement the SPA_FEATURE_LIVELIST refcount; when B_FALSE only
 *         the ZAP entry is removed and the object itself is left alone.
 *
 * If this dir's livelist is the one currently queued for condensing, the
 * pending condense is cancelled before the livelist is closed.
 */
void
dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
{
uint64_t obj;
dsl_pool_t *dp = dmu_tx_pool(tx);
spa_t *spa = dp->dp_spa;
/*
 * By-value snapshot of the condense request as it looked on entry; it is
 * only used to decide whether this dir is the one queued for condensing.
 * State that may change during the wait below is re-read from
 * spa->spa_to_condense directly.
 */
livelist_condense_entry_t to_condense = spa->spa_to_condense;
if (!dsl_deadlist_is_open(&dd->dd_livelist))
return;
/*
 * If the livelist being removed is set to be condensed, stop the
 * condense zthr and indicate the cancellation in the spa_to_condense
 * struct in case the condense no-wait synctask has already started
 */
zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
if (ll_condense_thread != NULL &&
(to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
/*
 * We use zthr_wait_cycle_done instead of zthr_cancel
 * because we don't want to destroy the zthr, just have
 * it skip its current task.
 */
spa->spa_to_condense.cancelled = B_TRUE;
zthr_wait_cycle_done(ll_condense_thread);
/*
 * If we've returned from zthr_wait_cycle_done without
 * the to_condense data structure having been cleared,
 * there are two possibilities. Either the no-wait
 * synctask has started (which is indicated by the
 * 'syncing' field of to_condense), in which case we
 * can expect it to clear to_condense on its own; or we
 * returned before the zthr ran. In the latter case the
 * checkfunc will now fail as cancelled == B_TRUE so we
 * can safely NULL out ds, allowing a different dir's
 * livelist to be condensed.
 *
 * We can be sure that the to_condense struct will not
 * be repopulated at this stage because both this
 * function and dsl_livelist_try_condense execute in
 * syncing context.
 */
if ((spa->spa_to_condense.ds != NULL) &&
!spa->spa_to_condense.syncing) {
/*
 * Drop the hold on the dataset's dbuf that is tagged with
 * 'spa' -- presumably taken when the condense was queued;
 * TODO confirm against dsl_livelist_try_condense.
 */
dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
spa);
spa->spa_to_condense.ds = NULL;
}
}
dsl_dir_livelist_close(dd);
/*
 * Look up the livelist object number before removing the ZAP entry so
 * that the object can still be freed below when 'total' is set.
 */
int err = zap_lookup(dp->dp_meta_objset, dd->dd_object,
DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj);
if (err == 0) {
VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
DD_FIELD_LIVELIST, tx));
if (total) {
dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
}
} else {
/*
 * The livelist was open, so a missing ZAP entry is unexpected.
 * NOTE(review): any other lookup error is not acted on here, and
 * this check only fires in builds where ASSERT3U is enabled.
 */
ASSERT3U(err, !=, ENOENT);
}
}
#if defined(_KERNEL)
EXPORT_SYMBOL(dsl_dir_set_quota);
EXPORT_SYMBOL(dsl_dir_set_reservation);