mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-10-24 17:05:01 +03:00
Fast Clone Deletion
Deleting a clone requires finding blocks that are clone-only, not shared with the snapshot. This was done by traversing the entire block tree, which results in a large performance penalty for sparsely written clones. This new method keeps track of clone blocks when they are modified in a "Livelist" so that, when it’s time to delete, the clone-specific blocks are already at hand. We see performance improvements because now deletion work is proportional to the number of clone-modified blocks, not the size of the original dataset. Reviewed-by: Sean Eric Fagan <sef@ixsystems.com> Reviewed-by: Matt Ahrens <matt@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Signed-off-by: Sara Hartse <sara.hartse@delphix.com> Closes #8416
This commit is contained in:
parent
d274ac5460
commit
37f03da8ba
332
cmd/zdb/zdb.c
332
cmd/zdb/zdb.c
@ -115,7 +115,8 @@ uint64_t max_inflight = 1000;
|
||||
static int leaked_objects = 0;
|
||||
static range_tree_t *mos_refd_objs;
|
||||
|
||||
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
|
||||
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
|
||||
boolean_t);
|
||||
static void mos_obj_refd(uint64_t);
|
||||
static void mos_obj_refd_multiple(uint64_t);
|
||||
|
||||
@ -552,12 +553,16 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
|
||||
(void) printf("\t\tcomp = %s\n", comp);
|
||||
(void) printf("\t\tuncomp = %s\n", uncomp);
|
||||
}
|
||||
if (size >= sizeof (*bpop)) {
|
||||
if (size >= BPOBJ_SIZE_V2) {
|
||||
(void) printf("\t\tsubobjs = %llu\n",
|
||||
(u_longlong_t)bpop->bpo_subobjs);
|
||||
(void) printf("\t\tnum_subobjs = %llu\n",
|
||||
(u_longlong_t)bpop->bpo_num_subobjs);
|
||||
}
|
||||
if (size >= sizeof (*bpop)) {
|
||||
(void) printf("\t\tnum_freed = %llu\n",
|
||||
(u_longlong_t)bpop->bpo_num_freed);
|
||||
}
|
||||
|
||||
if (dump_opt['d'] < 5)
|
||||
return;
|
||||
@ -572,7 +577,8 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
|
||||
(void) printf("got error %u from dmu_read\n", err);
|
||||
break;
|
||||
}
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
|
||||
BP_GET_FREE(&bp));
|
||||
(void) printf("\t%s\n", blkbuf);
|
||||
}
|
||||
}
|
||||
@ -1508,7 +1514,8 @@ blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
|
||||
}
|
||||
|
||||
static void
|
||||
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
|
||||
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
|
||||
boolean_t bp_freed)
|
||||
{
|
||||
const dva_t *dva = bp->blk_dva;
|
||||
int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
|
||||
@ -1516,6 +1523,10 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
|
||||
|
||||
if (dump_opt['b'] >= 6) {
|
||||
snprintf_blkptr(blkbuf, buflen, bp);
|
||||
if (bp_freed) {
|
||||
(void) snprintf(blkbuf + strlen(blkbuf),
|
||||
buflen - strlen(blkbuf), " %s", "FREE");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1553,6 +1564,9 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
|
||||
(u_longlong_t)BP_GET_FILL(bp),
|
||||
(u_longlong_t)bp->blk_birth,
|
||||
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
|
||||
if (bp_freed)
|
||||
(void) snprintf(blkbuf + strlen(blkbuf),
|
||||
buflen - strlen(blkbuf), " %s", "FREE");
|
||||
}
|
||||
}
|
||||
|
||||
@ -1580,7 +1594,7 @@ print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
|
||||
}
|
||||
}
|
||||
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
|
||||
(void) printf("%s\n", blkbuf);
|
||||
}
|
||||
|
||||
@ -1815,12 +1829,12 @@ dump_bptree(objset_t *os, uint64_t obj, const char *name)
|
||||
|
||||
/* ARGSUSED */
|
||||
static int
|
||||
dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
|
||||
{
|
||||
char blkbuf[BP_SPRINTF_LEN];
|
||||
|
||||
ASSERT(bp->blk_birth != 0);
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
|
||||
(void) printf("\t%s\n", blkbuf);
|
||||
return (0);
|
||||
}
|
||||
@ -1845,14 +1859,28 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
|
||||
if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
|
||||
zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
|
||||
zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
|
||||
(void) printf(" %*s: object %llu, %llu local blkptrs, "
|
||||
"%llu subobjs in object, %llu, %s (%s/%s comp)\n",
|
||||
indent * 8, name,
|
||||
(u_longlong_t)bpo->bpo_object,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_subobjs,
|
||||
bytes, comp, uncomp);
|
||||
if (bpo->bpo_havefreed) {
|
||||
(void) printf(" %*s: object %llu, %llu local "
|
||||
"blkptrs, %llu freed, %llu subobjs in object %llu, "
|
||||
"%s (%s/%s comp)\n",
|
||||
indent * 8, name,
|
||||
(u_longlong_t)bpo->bpo_object,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_freed,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_subobjs,
|
||||
bytes, comp, uncomp);
|
||||
} else {
|
||||
(void) printf(" %*s: object %llu, %llu local "
|
||||
"blkptrs, %llu subobjs in object %llu, "
|
||||
"%s (%s/%s comp)\n",
|
||||
indent * 8, name,
|
||||
(u_longlong_t)bpo->bpo_object,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_subobjs,
|
||||
bytes, comp, uncomp);
|
||||
}
|
||||
|
||||
for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
|
||||
uint64_t subobj;
|
||||
@ -1872,11 +1900,22 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
|
||||
bpobj_close(&subbpo);
|
||||
}
|
||||
} else {
|
||||
(void) printf(" %*s: object %llu, %llu blkptrs, %s\n",
|
||||
indent * 8, name,
|
||||
(u_longlong_t)bpo->bpo_object,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
|
||||
bytes);
|
||||
if (bpo->bpo_havefreed) {
|
||||
(void) printf(" %*s: object %llu, %llu blkptrs, "
|
||||
"%llu freed, %s\n",
|
||||
indent * 8, name,
|
||||
(u_longlong_t)bpo->bpo_object,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_freed,
|
||||
bytes);
|
||||
} else {
|
||||
(void) printf(" %*s: object %llu, %llu blkptrs, "
|
||||
"%s\n",
|
||||
indent * 8, name,
|
||||
(u_longlong_t)bpo->bpo_object,
|
||||
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
|
||||
bytes);
|
||||
}
|
||||
}
|
||||
|
||||
if (dump_opt['d'] < 5)
|
||||
@ -2038,36 +2077,59 @@ bpobj_count_refd(bpobj_t *bpo)
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
dump_deadlist(dsl_deadlist_t *dl)
|
||||
static int
|
||||
dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
|
||||
{
|
||||
spa_t *spa = arg;
|
||||
uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
|
||||
if (dle->dle_bpobj.bpo_object != empty_bpobj)
|
||||
bpobj_count_refd(&dle->dle_bpobj);
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
|
||||
{
|
||||
ASSERT(arg == NULL);
|
||||
if (dump_opt['d'] >= 5) {
|
||||
char buf[128];
|
||||
(void) snprintf(buf, sizeof (buf),
|
||||
"mintxg %llu -> obj %llu",
|
||||
(longlong_t)dle->dle_mintxg,
|
||||
(longlong_t)dle->dle_bpobj.bpo_object);
|
||||
|
||||
dump_full_bpobj(&dle->dle_bpobj, buf, 0);
|
||||
} else {
|
||||
(void) printf("mintxg %llu -> obj %llu\n",
|
||||
(longlong_t)dle->dle_mintxg,
|
||||
(longlong_t)dle->dle_bpobj.bpo_object);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
dump_blkptr_list(dsl_deadlist_t *dl, char *name)
|
||||
{
|
||||
dsl_deadlist_entry_t *dle;
|
||||
uint64_t unused;
|
||||
char bytes[32];
|
||||
char comp[32];
|
||||
char uncomp[32];
|
||||
uint64_t empty_bpobj =
|
||||
dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj;
|
||||
|
||||
/* force the tree to be loaded */
|
||||
dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
|
||||
char entries[32];
|
||||
spa_t *spa = dmu_objset_spa(dl->dl_os);
|
||||
uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
|
||||
|
||||
if (dl->dl_oldfmt) {
|
||||
if (dl->dl_bpobj.bpo_object != empty_bpobj)
|
||||
bpobj_count_refd(&dl->dl_bpobj);
|
||||
} else {
|
||||
mos_obj_refd(dl->dl_object);
|
||||
for (dle = avl_first(&dl->dl_tree); dle;
|
||||
dle = AVL_NEXT(&dl->dl_tree, dle)) {
|
||||
if (dle->dle_bpobj.bpo_object != empty_bpobj)
|
||||
bpobj_count_refd(&dle->dle_bpobj);
|
||||
}
|
||||
dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
|
||||
}
|
||||
|
||||
/* make sure nicenum has enough space */
|
||||
CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
|
||||
CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
|
||||
CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
|
||||
CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ);
|
||||
|
||||
if (dump_opt['d'] < 3)
|
||||
return;
|
||||
@ -2080,30 +2142,60 @@ dump_deadlist(dsl_deadlist_t *dl)
|
||||
zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
|
||||
zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
|
||||
zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
|
||||
(void) printf("\n Deadlist: %s (%s/%s comp)\n",
|
||||
bytes, comp, uncomp);
|
||||
zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
|
||||
(void) printf("\n %s: %s (%s/%s comp), %s entries\n",
|
||||
name, bytes, comp, uncomp, entries);
|
||||
|
||||
if (dump_opt['d'] < 4)
|
||||
return;
|
||||
|
||||
(void) printf("\n");
|
||||
|
||||
for (dle = avl_first(&dl->dl_tree); dle;
|
||||
dle = AVL_NEXT(&dl->dl_tree, dle)) {
|
||||
if (dump_opt['d'] >= 5) {
|
||||
char buf[128];
|
||||
(void) snprintf(buf, sizeof (buf),
|
||||
"mintxg %llu -> obj %llu",
|
||||
(longlong_t)dle->dle_mintxg,
|
||||
(longlong_t)dle->dle_bpobj.bpo_object);
|
||||
dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
|
||||
}
|
||||
|
||||
dump_full_bpobj(&dle->dle_bpobj, buf, 0);
|
||||
} else {
|
||||
(void) printf("mintxg %llu -> obj %llu\n",
|
||||
(longlong_t)dle->dle_mintxg,
|
||||
(longlong_t)dle->dle_bpobj.bpo_object);
|
||||
}
|
||||
static int
|
||||
verify_dd_livelist(objset_t *os)
|
||||
{
|
||||
uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
|
||||
dsl_pool_t *dp = spa_get_dsl(os->os_spa);
|
||||
dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
|
||||
|
||||
ASSERT(!dmu_objset_is_snapshot(os));
|
||||
if (!dsl_deadlist_is_open(&dd->dd_livelist))
|
||||
return (0);
|
||||
dsl_pool_config_enter(dp, FTAG);
|
||||
dsl_deadlist_space(&dd->dd_livelist, &ll_used,
|
||||
&ll_comp, &ll_uncomp);
|
||||
|
||||
dsl_dataset_t *origin_ds;
|
||||
ASSERT(dsl_pool_config_held(dp));
|
||||
VERIFY0(dsl_dataset_hold_obj(dp,
|
||||
dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
|
||||
VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
|
||||
&used, &comp, &uncomp));
|
||||
dsl_dataset_rele(origin_ds, FTAG);
|
||||
dsl_pool_config_exit(dp, FTAG);
|
||||
/*
|
||||
* It's possible that the dataset's uncomp space is larger than the
|
||||
* livelist's because livelists do not track embedded block pointers
|
||||
*/
|
||||
if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
|
||||
char nice_used[32], nice_comp[32], nice_uncomp[32];
|
||||
(void) printf("Discrepancy in space accounting:\n");
|
||||
zdb_nicenum(used, nice_used, sizeof (nice_used));
|
||||
zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
|
||||
zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
|
||||
(void) printf("dir: used %s, comp %s, uncomp %s\n",
|
||||
nice_used, nice_comp, nice_uncomp);
|
||||
zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
|
||||
zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
|
||||
zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
|
||||
(void) printf("livelist: used %s, comp %s, uncomp %s\n",
|
||||
nice_used, nice_comp, nice_uncomp);
|
||||
return (1);
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
static avl_tree_t idx_tree;
|
||||
@ -2643,7 +2735,7 @@ static const char *objset_types[DMU_OST_NUMTYPES] = {
|
||||
"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
|
||||
|
||||
static void
|
||||
dump_dir(objset_t *os)
|
||||
dump_objset(objset_t *os)
|
||||
{
|
||||
dmu_objset_stats_t dds;
|
||||
uint64_t object, object_count;
|
||||
@ -2716,11 +2808,17 @@ dump_dir(objset_t *os)
|
||||
|
||||
if (dmu_objset_ds(os) != NULL) {
|
||||
dsl_dataset_t *ds = dmu_objset_ds(os);
|
||||
dump_deadlist(&ds->ds_deadlist);
|
||||
dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
|
||||
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
|
||||
!dmu_objset_is_snapshot(os)) {
|
||||
dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
|
||||
if (verify_dd_livelist(os) != 0)
|
||||
fatal("livelist is incorrect");
|
||||
}
|
||||
|
||||
if (dsl_dataset_remap_deadlist_exists(ds)) {
|
||||
(void) printf("ds_remap_deadlist:\n");
|
||||
dump_deadlist(&ds->ds_remap_deadlist);
|
||||
dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
|
||||
}
|
||||
count_ds_mos_objects(ds);
|
||||
}
|
||||
@ -3470,7 +3568,7 @@ static uint64_t remap_deadlist_count = 0;
|
||||
|
||||
/*ARGSUSED*/
|
||||
static int
|
||||
dump_one_dir(const char *dsname, void *arg)
|
||||
dump_one_objset(const char *dsname, void *arg)
|
||||
{
|
||||
int error;
|
||||
objset_t *os;
|
||||
@ -3502,7 +3600,12 @@ dump_one_dir(const char *dsname, void *arg)
|
||||
global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
|
||||
}
|
||||
|
||||
dump_dir(os);
|
||||
if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
|
||||
!dmu_objset_is_snapshot(os)) {
|
||||
global_feature_count[SPA_FEATURE_LIVELIST]++;
|
||||
}
|
||||
|
||||
dump_objset(os);
|
||||
close_objset(os, FTAG);
|
||||
fuid_table_destroy();
|
||||
return (0);
|
||||
@ -3993,13 +4096,15 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
|
||||
|
||||
/* ARGSUSED */
|
||||
static int
|
||||
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
zdb_cb_t *zcb = arg;
|
||||
spa_t *spa = zcb->zcb_spa;
|
||||
vdev_t *vd;
|
||||
const dva_t *dva = &bp->blk_dva[0];
|
||||
|
||||
ASSERT(!bp_freed);
|
||||
ASSERT(!dump_opt['L']);
|
||||
ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
|
||||
|
||||
@ -4617,6 +4722,101 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate over livelists which have been destroyed by the user but
|
||||
* are still present in the MOS, waiting to be freed
|
||||
*/
|
||||
typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
|
||||
|
||||
static void
|
||||
iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
|
||||
{
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
uint64_t zap_obj;
|
||||
int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
|
||||
if (err == ENOENT)
|
||||
return;
|
||||
ASSERT0(err);
|
||||
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t attr;
|
||||
dsl_deadlist_t ll;
|
||||
/* NULL out os prior to dsl_deadlist_open in case it's garbage */
|
||||
ll.dl_os = NULL;
|
||||
for (zap_cursor_init(&zc, mos, zap_obj);
|
||||
zap_cursor_retrieve(&zc, &attr) == 0;
|
||||
(void) zap_cursor_advance(&zc)) {
|
||||
dsl_deadlist_open(&ll, mos, attr.za_first_integer);
|
||||
func(&ll, arg);
|
||||
dsl_deadlist_close(&ll);
|
||||
}
|
||||
zap_cursor_fini(&zc);
|
||||
}
|
||||
|
||||
static int
|
||||
bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(!bp_freed);
|
||||
return (count_block_cb(arg, bp, tx));
|
||||
}
|
||||
|
||||
static int
|
||||
livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
|
||||
{
|
||||
zdb_cb_t *zbc = args;
|
||||
bplist_t blks;
|
||||
bplist_create(&blks);
|
||||
/* determine which blocks have been alloc'd but not freed */
|
||||
VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
|
||||
/* count those blocks */
|
||||
(void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
|
||||
bplist_destroy(&blks);
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
|
||||
{
|
||||
dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Count the blocks in the livelists that have been destroyed by the user
|
||||
* but haven't yet been freed.
|
||||
*/
|
||||
static void
|
||||
deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
|
||||
{
|
||||
iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
|
||||
}
|
||||
|
||||
static void
|
||||
dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
|
||||
{
|
||||
ASSERT3P(arg, ==, NULL);
|
||||
global_feature_count[SPA_FEATURE_LIVELIST]++;
|
||||
dump_blkptr_list(ll, "Deleted Livelist");
|
||||
}
|
||||
|
||||
/*
|
||||
* Print out, register object references to, and increment feature counts for
|
||||
* livelists that have been destroyed by the user but haven't yet been freed.
|
||||
*/
|
||||
static void
|
||||
deleted_livelists_dump_mos(spa_t *spa)
|
||||
{
|
||||
uint64_t zap_obj;
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
|
||||
if (err == ENOENT)
|
||||
return;
|
||||
mos_obj_refd(zap_obj);
|
||||
iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
|
||||
}
|
||||
|
||||
static int
|
||||
dump_block_stats(spa_t *spa)
|
||||
{
|
||||
@ -4656,11 +4856,11 @@ dump_block_stats(spa_t *spa)
|
||||
* If there's a deferred-free bplist, process that first.
|
||||
*/
|
||||
(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
|
||||
count_block_cb, &zcb, NULL);
|
||||
bpobj_count_block_cb, &zcb, NULL);
|
||||
|
||||
if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
|
||||
(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
|
||||
count_block_cb, &zcb, NULL);
|
||||
bpobj_count_block_cb, &zcb, NULL);
|
||||
}
|
||||
|
||||
zdb_claim_removing(spa, &zcb);
|
||||
@ -4671,6 +4871,8 @@ dump_block_stats(spa_t *spa)
|
||||
&zcb, NULL));
|
||||
}
|
||||
|
||||
deleted_livelists_count_blocks(spa, &zcb);
|
||||
|
||||
if (dump_opt['c'] > 1)
|
||||
flags |= TRAVERSE_PREFETCH_DATA;
|
||||
|
||||
@ -5706,6 +5908,7 @@ dump_mos_leaks(spa_t *spa)
|
||||
mos_obj_refd(vim->vim_phys->vimp_counts_object);
|
||||
vdev_indirect_mapping_close(vim);
|
||||
}
|
||||
deleted_livelists_dump_mos(spa);
|
||||
|
||||
if (dp->dp_origin_snap != NULL) {
|
||||
dsl_dataset_t *ds;
|
||||
@ -5715,12 +5918,12 @@ dump_mos_leaks(spa_t *spa)
|
||||
dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
|
||||
FTAG, &ds));
|
||||
count_ds_mos_objects(ds);
|
||||
dump_deadlist(&ds->ds_deadlist);
|
||||
dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
|
||||
dsl_dataset_rele(ds, FTAG);
|
||||
dsl_pool_config_exit(dp, FTAG);
|
||||
|
||||
count_ds_mos_objects(dp->dp_origin_snap);
|
||||
dump_deadlist(&dp->dp_origin_snap->ds_deadlist);
|
||||
dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
|
||||
}
|
||||
count_dir_mos_objects(dp->dp_mos_dir);
|
||||
if (dp->dp_free_dir != NULL)
|
||||
@ -5885,7 +6088,7 @@ dump_zpool(spa_t *spa)
|
||||
if (dump_opt['d'] || dump_opt['i']) {
|
||||
spa_feature_t f;
|
||||
mos_refd_objs = range_tree_create(NULL, NULL);
|
||||
dump_dir(dp->dp_meta_objset);
|
||||
dump_objset(dp->dp_meta_objset);
|
||||
|
||||
if (dump_opt['d'] >= 3) {
|
||||
dsl_pool_t *dp = spa->spa_dsl_pool;
|
||||
@ -5915,8 +6118,9 @@ dump_zpool(spa_t *spa)
|
||||
global_feature_count[f] = UINT64_MAX;
|
||||
global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
|
||||
global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
|
||||
global_feature_count[SPA_FEATURE_LIVELIST] = 0;
|
||||
|
||||
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
|
||||
(void) dmu_objset_find(spa_name(spa), dump_one_objset,
|
||||
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
|
||||
|
||||
if (rc == 0 && !dump_opt['L'])
|
||||
@ -6777,9 +6981,9 @@ main(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
if (os != NULL) {
|
||||
dump_dir(os);
|
||||
dump_objset(os);
|
||||
} else if (zopt_objects > 0 && !dump_opt['m']) {
|
||||
dump_dir(spa->spa_meta_objset);
|
||||
dump_objset(spa->spa_meta_objset);
|
||||
} else {
|
||||
dump_zpool(spa);
|
||||
}
|
||||
|
@ -20,6 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2018 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_BPLIST_H
|
||||
@ -49,6 +50,7 @@ void bplist_destroy(bplist_t *bpl);
|
||||
void bplist_append(bplist_t *bpl, const blkptr_t *bp);
|
||||
void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
|
||||
void *arg, dmu_tx_t *tx);
|
||||
void bplist_clear(bplist_t *bpl);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2015, 2019 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_BPOBJ_H
|
||||
@ -31,6 +31,7 @@
|
||||
#include <sys/txg.h>
|
||||
#include <sys/zio.h>
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/bplist.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@ -48,10 +49,12 @@ typedef struct bpobj_phys {
|
||||
uint64_t bpo_uncomp;
|
||||
uint64_t bpo_subobjs;
|
||||
uint64_t bpo_num_subobjs;
|
||||
uint64_t bpo_num_freed;
|
||||
} bpobj_phys_t;
|
||||
|
||||
#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
|
||||
#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
|
||||
#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t))
|
||||
|
||||
typedef struct bpobj {
|
||||
kmutex_t bpo_lock;
|
||||
@ -60,12 +63,14 @@ typedef struct bpobj {
|
||||
int bpo_epb;
|
||||
uint8_t bpo_havecomp;
|
||||
uint8_t bpo_havesubobj;
|
||||
uint8_t bpo_havefreed;
|
||||
bpobj_phys_t *bpo_phys;
|
||||
dmu_buf_t *bpo_dbuf;
|
||||
dmu_buf_t *bpo_cached_dbuf;
|
||||
} bpobj_t;
|
||||
|
||||
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
|
||||
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx);
|
||||
|
||||
uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
|
||||
uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
|
||||
@ -77,10 +82,13 @@ void bpobj_close(bpobj_t *bpo);
|
||||
boolean_t bpobj_is_open(const bpobj_t *bpo);
|
||||
|
||||
int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
|
||||
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
|
||||
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *);
|
||||
int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func,
|
||||
void *arg, int64_t start);
|
||||
|
||||
void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
|
||||
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
|
||||
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx);
|
||||
|
||||
int bpobj_space(bpobj_t *bpo,
|
||||
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
|
||||
@ -88,6 +96,9 @@ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
|
||||
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
|
||||
boolean_t bpobj_is_empty(bpobj_t *bpo);
|
||||
|
||||
int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -383,6 +383,7 @@ typedef struct dmu_buf {
|
||||
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
|
||||
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
|
||||
#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap"
|
||||
#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones"
|
||||
|
||||
/*
|
||||
* Allocate an object from this objset. The range of object numbers
|
||||
@ -1003,6 +1004,7 @@ extern uint64_t dmu_objset_id(objset_t *os);
|
||||
extern uint64_t dmu_objset_dnodesize(objset_t *os);
|
||||
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
|
||||
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
|
||||
extern int dmu_objset_blksize(objset_t *os);
|
||||
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
|
||||
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
|
||||
extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
|
||||
|
@ -126,7 +126,7 @@ struct objset {
|
||||
zfs_cache_type_t os_secondary_cache;
|
||||
zfs_sync_type_t os_sync;
|
||||
zfs_redundant_metadata_type_t os_redundant_metadata;
|
||||
int os_recordsize;
|
||||
uint64_t os_recordsize;
|
||||
/*
|
||||
* The next four values are used as a cache of whatever's on disk, and
|
||||
* are initialized the first time these properties are queried. Before
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2015 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2018, 2019 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_DSL_DEADLIST_H
|
||||
@ -28,12 +28,14 @@
|
||||
|
||||
#include <sys/bpobj.h>
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/zthr.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct dmu_buf;
|
||||
struct dsl_pool;
|
||||
struct dsl_dataset;
|
||||
|
||||
typedef struct dsl_deadlist_phys {
|
||||
@ -63,13 +65,34 @@ typedef struct dsl_deadlist_entry {
|
||||
bpobj_t dle_bpobj;
|
||||
} dsl_deadlist_entry_t;
|
||||
|
||||
typedef struct livelist_condense_entry {
|
||||
struct dsl_dataset *ds;
|
||||
dsl_deadlist_entry_t *first;
|
||||
dsl_deadlist_entry_t *next;
|
||||
boolean_t syncing;
|
||||
boolean_t cancelled;
|
||||
} livelist_condense_entry_t;
|
||||
|
||||
extern unsigned long zfs_livelist_max_entries;
|
||||
extern int zfs_livelist_min_percent_shared;
|
||||
|
||||
typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle);
|
||||
|
||||
void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
|
||||
void dsl_deadlist_close(dsl_deadlist_t *dl);
|
||||
void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg);
|
||||
uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
|
||||
void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
|
||||
void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
|
||||
void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp,
|
||||
boolean_t free, dmu_tx_t *tx);
|
||||
int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
|
||||
int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
|
||||
void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
|
||||
void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
|
||||
void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg,
|
||||
dmu_tx_t *tx);
|
||||
dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl);
|
||||
dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl);
|
||||
uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
|
||||
uint64_t mrs_obj, dmu_tx_t *tx);
|
||||
void dsl_deadlist_space(dsl_deadlist_t *dl,
|
||||
@ -81,6 +104,10 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
|
||||
void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
|
||||
dmu_tx_t *tx);
|
||||
boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
|
||||
int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free,
|
||||
zthr_t *t, uint64_t *size);
|
||||
void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
|
||||
dmu_tx_t *tx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -33,6 +33,7 @@ extern "C" {
|
||||
|
||||
struct nvlist;
|
||||
struct dsl_dataset;
|
||||
struct dsl_pool;
|
||||
struct dmu_tx;
|
||||
|
||||
int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
*/
|
||||
@ -29,18 +29,20 @@
|
||||
#define _SYS_DSL_DIR_H
|
||||
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/dsl_deadlist.h>
|
||||
#include <sys/dsl_pool.h>
|
||||
#include <sys/dsl_synctask.h>
|
||||
#include <sys/refcount.h>
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/dsl_crypt.h>
|
||||
#include <sys/bplist.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct dsl_dataset;
|
||||
|
||||
struct zthr;
|
||||
/*
|
||||
* DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
|
||||
* They should be of the format <reverse-dns>:<field>.
|
||||
@ -49,6 +51,7 @@ struct dsl_dataset;
|
||||
#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
|
||||
#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
|
||||
#define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj"
|
||||
#define DD_FIELD_LIVELIST "com.delphix:livelist"
|
||||
|
||||
typedef enum dd_used {
|
||||
DD_USED_HEAD,
|
||||
@ -114,6 +117,10 @@ struct dsl_dir {
|
||||
/* amount of space we expect to write; == amount of dirty data */
|
||||
int64_t dd_space_towrite[TXG_SIZE];
|
||||
|
||||
dsl_deadlist_t dd_livelist;
|
||||
bplist_t dd_pending_frees;
|
||||
bplist_t dd_pending_allocs;
|
||||
|
||||
/* protected by dd_lock; keep at end of struct for better locality */
|
||||
char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
|
||||
};
|
||||
@ -182,6 +189,9 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
|
||||
dmu_tx_t *tx);
|
||||
void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
|
||||
boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
|
||||
void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj);
|
||||
void dsl_dir_livelist_close(dsl_dir_t *dd);
|
||||
void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total);
|
||||
|
||||
/* internal reserved dir name */
|
||||
#define MOS_DIR_NAME "$MOS"
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013, 2018 by Delphix. All rights reserved.
|
||||
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
|
||||
*/
|
||||
|
||||
@ -54,6 +54,7 @@ struct dsl_pool;
|
||||
struct dmu_tx;
|
||||
struct dsl_scan;
|
||||
struct dsl_crypto_params;
|
||||
struct dsl_deadlist;
|
||||
|
||||
extern unsigned long zfs_dirty_data_max;
|
||||
extern unsigned long zfs_dirty_data_max_max;
|
||||
|
@ -63,6 +63,8 @@ typedef struct ddt ddt_t;
|
||||
typedef struct ddt_entry ddt_entry_t;
|
||||
typedef struct zbookmark_phys zbookmark_phys_t;
|
||||
|
||||
struct bpobj;
|
||||
struct bplist;
|
||||
struct dsl_pool;
|
||||
struct dsl_dataset;
|
||||
struct dsl_crypto_params;
|
||||
@ -532,6 +534,9 @@ _NOTE(CONSTCOND) } while (0)
|
||||
#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
|
||||
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
|
||||
|
||||
#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1)
|
||||
#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x)
|
||||
|
||||
#define BP_PHYSICAL_BIRTH(bp) \
|
||||
(BP_IS_EMBEDDED(bp) ? 0 : \
|
||||
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
|
||||
@ -654,6 +659,7 @@ _NOTE(CONSTCOND) } while (0)
|
||||
* 'func' is either snprintf() or mdb_snprintf().
|
||||
* 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
|
||||
*/
|
||||
|
||||
#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
|
||||
{ \
|
||||
static const char *copyname[] = \
|
||||
@ -804,6 +810,8 @@ extern spa_t *spa_inject_addref(char *pool);
|
||||
extern void spa_inject_delref(spa_t *spa);
|
||||
extern void spa_scan_stat_init(spa_t *spa);
|
||||
extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
|
||||
extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
|
||||
extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
|
||||
|
||||
#define SPA_ASYNC_CONFIG_UPDATE 0x01
|
||||
#define SPA_ASYNC_REMOVE 0x02
|
||||
@ -1131,6 +1139,7 @@ extern uint64_t spa_total_metaslabs(spa_t *spa);
|
||||
extern boolean_t spa_multihost(spa_t *spa);
|
||||
extern unsigned long spa_get_hostid(void);
|
||||
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
|
||||
extern boolean_t spa_livelist_delete_check(spa_t *spa);
|
||||
|
||||
extern int spa_mode(spa_t *spa);
|
||||
extern uint64_t zfs_strtonum(const char *str, char **nptr);
|
||||
|
@ -49,6 +49,7 @@
|
||||
#include <sys/dsl_crypt.h>
|
||||
#include <sys/zfeature.h>
|
||||
#include <sys/zthr.h>
|
||||
#include <sys/dsl_deadlist.h>
|
||||
#include <zfeature_common.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
@ -317,6 +318,11 @@ struct spa {
|
||||
list_t spa_log_summary;
|
||||
uint64_t spa_log_flushall_txg;
|
||||
|
||||
zthr_t *spa_livelist_delete_zthr; /* deleting livelists */
|
||||
zthr_t *spa_livelist_condense_zthr; /* condensing livelists */
|
||||
uint64_t spa_livelists_to_delete; /* set of livelists to free */
|
||||
livelist_condense_entry_t spa_to_condense; /* next to condense */
|
||||
|
||||
char *spa_root; /* alternate root directory */
|
||||
uint64_t spa_ena; /* spa-wide ereport ENA */
|
||||
int spa_last_open_failed; /* error if last open failed */
|
||||
|
@ -33,7 +33,9 @@ extern void zthr_destroy(zthr_t *t);
|
||||
extern void zthr_wakeup(zthr_t *t);
|
||||
extern void zthr_cancel(zthr_t *t);
|
||||
extern void zthr_resume(zthr_t *t);
|
||||
extern void zthr_wait_cycle_done(zthr_t *t);
|
||||
|
||||
extern boolean_t zthr_iscancelled(zthr_t *t);
|
||||
extern boolean_t zthr_has_waiters(zthr_t *t);
|
||||
|
||||
#endif /* _SYS_ZTHR_H */
|
||||
|
@ -71,6 +71,7 @@ typedef enum spa_feature {
|
||||
SPA_FEATURE_REDACTED_DATASETS,
|
||||
SPA_FEATURE_BOOKMARK_WRITTEN,
|
||||
SPA_FEATURE_LOG_SPACEMAP,
|
||||
SPA_FEATURE_LIVELIST,
|
||||
SPA_FEATURES
|
||||
} spa_feature_t;
|
||||
|
||||
|
@ -1909,6 +1909,98 @@ Pattern written to vdev free space by \fBzpool initialize\fR.
|
||||
Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee).
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fBzfs_livelist_max_entries\fR (ulong)
|
||||
.ad
|
||||
.RS 12n
|
||||
The threshold size (in block pointers) at which we create a new sub-livelist.
|
||||
Larger sublists are more costly from a memory perspective but the fewer
|
||||
sublists there are, the lower the cost of insertion.
|
||||
.sp
|
||||
Default value: \fB500,000\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fBzfs_livelist_min_percent_shared\fR (int)
|
||||
.ad
|
||||
.RS 12n
|
||||
If the amount of shared space between a snapshot and its clone drops below
|
||||
this threshold, the clone turns off the livelist and reverts to the old deletion
|
||||
method. This is in place because once a clone has been overwritten enough
|
||||
livelists no longer give us a benefit.
|
||||
.sp
|
||||
Default value: \fB75\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fBzfs_livelist_condense_new_alloc\fR (int)
|
||||
.ad
|
||||
.RS 12n
|
||||
Incremented each time an extra ALLOC blkptr is added to a livelist entry while
|
||||
it is being condensed.
|
||||
This option is used by the test suite to track race conditions.
|
||||
.sp
|
||||
Default value: \fB0\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fBzfs_livelist_condense_sync_cancel\fR (int)
|
||||
.ad
|
||||
.RS 12n
|
||||
Incremented each time livelist condensing is canceled while in
|
||||
spa_livelist_condense_sync.
|
||||
This option is used by the test suite to track race conditions.
|
||||
.sp
|
||||
Default value: \fB0\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fBzfs_livelist_condense_sync_pause\fR (int)
|
||||
.ad
|
||||
.RS 12n
|
||||
When set, the livelist condense process pauses indefinitely before
|
||||
executing the synctask - spa_livelist_condense_sync.
|
||||
This option is used by the test suite to trigger race conditions.
|
||||
.sp
|
||||
Default value: \fB0\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fBzfs_livelist_condense_zthr_cancel\fR (int)
|
||||
.ad
|
||||
.RS 12n
|
||||
Incremented each time livelist condensing is canceled while in
|
||||
spa_livelist_condense_cb.
|
||||
This option is used by the test suite to track race conditions.
|
||||
.sp
|
||||
Default value: \fB0\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fBzfs_livelist_condense_zthr_pause\fR (int)
|
||||
.ad
|
||||
.RS 12n
|
||||
When set, the livelist condense process pauses indefinitely before
|
||||
executing the open context condensing work in spa_livelist_condense_cb.
|
||||
This option is used by the test suite to trigger race conditions.
|
||||
.sp
|
||||
Default value: \fB0\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
|
@ -547,6 +547,26 @@ allow more data to be stored in the bonus buffer, thus potentially
|
||||
improving performance by avoiding the use of spill blocks.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fB\fBlivelist\fR\fR
|
||||
.ad
|
||||
.RS 4n
|
||||
.TS
|
||||
l l .
|
||||
GUID com.delphix:livelist
|
||||
READ\-ONLY COMPATIBLE yes
|
||||
DEPENDENCIES extensible_dataset
|
||||
.TE
|
||||
This feature allows clones to be deleted faster than the traditional method
|
||||
when a large number of random/sparse writes have been made to the clone.
|
||||
All blocks allocated and freed after a clone is created are tracked by the
|
||||
clone's livelist which is referenced during the deletion of the clone.
|
||||
The feature is activated when a clone is created and remains active until all
|
||||
clones have been destroyed.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
@ -882,7 +902,6 @@ This feature becomes \fBactive\fR when the \fBzpool checkpoint\fR subcommand
|
||||
is used to checkpoint the pool.
|
||||
The feature will only return back to being \fBenabled\fR when the pool
|
||||
is rewound or the checkpoint has been discarded.
|
||||
.RE
|
||||
|
||||
.SH "SEE ALSO"
|
||||
zpool(8)
|
||||
|
@ -348,6 +348,18 @@ zpool_feature_init(void)
|
||||
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
|
||||
ZFEATURE_TYPE_BOOLEAN, NULL);
|
||||
|
||||
{
|
||||
static const spa_feature_t livelist_deps[] = {
|
||||
SPA_FEATURE_EXTENSIBLE_DATASET,
|
||||
SPA_FEATURE_NONE
|
||||
};
|
||||
zfeature_register(SPA_FEATURE_LIVELIST,
|
||||
"com.delphix:livelist", "livelist",
|
||||
"Improved clone deletion performance.",
|
||||
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
|
||||
livelist_deps);
|
||||
}
|
||||
|
||||
{
|
||||
static const spa_feature_t log_spacemap_deps[] = {
|
||||
SPA_FEATURE_SPACEMAP_V2,
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/bplist.h>
|
||||
@ -75,3 +75,17 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
|
||||
}
|
||||
mutex_exit(&bpl->bpl_lock);
|
||||
}
|
||||
|
||||
void
|
||||
bplist_clear(bplist_t *bpl)
|
||||
{
|
||||
bplist_entry_t *bpe;
|
||||
|
||||
mutex_enter(&bpl->bpl_lock);
|
||||
while ((bpe = list_head(&bpl->bpl_list))) {
|
||||
bplist_iterate_last_removed = bpe;
|
||||
list_remove(&bpl->bpl_list, bpe);
|
||||
kmem_free(bpe, sizeof (*bpe));
|
||||
}
|
||||
mutex_exit(&bpl->bpl_lock);
|
||||
}
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2017 Datto Inc.
|
||||
*/
|
||||
|
||||
@ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
|
||||
size = BPOBJ_SIZE_V0;
|
||||
else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
|
||||
size = BPOBJ_SIZE_V1;
|
||||
else if (!spa_feature_is_active(dmu_objset_spa(os),
|
||||
SPA_FEATURE_LIVELIST))
|
||||
size = BPOBJ_SIZE_V2;
|
||||
else
|
||||
size = sizeof (bpobj_phys_t);
|
||||
|
||||
@ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
|
||||
bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
|
||||
bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
|
||||
bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
|
||||
bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
|
||||
bpo->bpo_phys = bpo->bpo_dbuf->db_data;
|
||||
return (0);
|
||||
}
|
||||
@ -245,8 +249,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
|
||||
* Update bpobj and all of its parents with new space accounting.
|
||||
*/
|
||||
static void
|
||||
propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
|
||||
uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
|
||||
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
|
||||
int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
|
||||
{
|
||||
|
||||
for (; bpi != NULL; bpi = bpi->bpi_parent) {
|
||||
@ -263,22 +267,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
|
||||
|
||||
static int
|
||||
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
|
||||
dmu_tx_t *tx, boolean_t free)
|
||||
int64_t start, dmu_tx_t *tx, boolean_t free)
|
||||
{
|
||||
int err = 0;
|
||||
uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
|
||||
int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
|
||||
dmu_buf_t *dbuf = NULL;
|
||||
bpobj_t *bpo = bpi->bpi_bpo;
|
||||
|
||||
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
|
||||
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
|
||||
uint64_t offset = i * sizeof (blkptr_t);
|
||||
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
|
||||
|
||||
if (dbuf == NULL || dbuf->db_offset > offset) {
|
||||
if (dbuf)
|
||||
dmu_buf_rele(dbuf, FTAG);
|
||||
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
|
||||
FTAG, &dbuf, 0);
|
||||
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
|
||||
offset, FTAG, &dbuf, 0);
|
||||
if (err)
|
||||
break;
|
||||
}
|
||||
@ -288,18 +292,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
|
||||
|
||||
blkptr_t *bparray = dbuf->db_data;
|
||||
blkptr_t *bp = &bparray[blkoff];
|
||||
err = func(arg, bp, tx);
|
||||
|
||||
boolean_t bp_freed = BP_GET_FREE(bp);
|
||||
err = func(arg, bp, bp_freed, tx);
|
||||
if (err)
|
||||
break;
|
||||
|
||||
if (free) {
|
||||
int sign = bp_freed ? -1 : +1;
|
||||
spa_t *spa = dmu_objset_spa(bpo->bpo_os);
|
||||
freed += bp_get_dsize_sync(spa, bp);
|
||||
comp_freed += BP_GET_PSIZE(bp);
|
||||
uncomp_freed += BP_GET_UCSIZE(bp);
|
||||
freed += sign * bp_get_dsize_sync(spa, bp);
|
||||
comp_freed += sign * BP_GET_PSIZE(bp);
|
||||
uncomp_freed += sign * BP_GET_UCSIZE(bp);
|
||||
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
|
||||
bpo->bpo_phys->bpo_num_blkptrs--;
|
||||
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
|
||||
if (bp_freed) {
|
||||
ASSERT(bpo->bpo_havefreed);
|
||||
bpo->bpo_phys->bpo_num_freed--;
|
||||
ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (free) {
|
||||
@ -328,7 +340,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
|
||||
*/
|
||||
static int
|
||||
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
dmu_tx_t *tx, boolean_t free)
|
||||
dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
|
||||
{
|
||||
list_t stack;
|
||||
bpobj_info_t *bpi;
|
||||
@ -341,6 +353,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
list_create(&stack, sizeof (bpobj_info_t),
|
||||
offsetof(bpobj_info_t, bpi_node));
|
||||
mutex_enter(&initial_bpo->bpo_lock);
|
||||
|
||||
if (bpobj_size != NULL)
|
||||
*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
|
||||
|
||||
list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
|
||||
|
||||
while ((bpi = list_head(&stack)) != NULL) {
|
||||
@ -354,7 +370,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
|
||||
|
||||
if (bpi->bpi_visited == B_FALSE) {
|
||||
err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
|
||||
err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
|
||||
free);
|
||||
bpi->bpi_visited = B_TRUE;
|
||||
if (err != 0)
|
||||
break;
|
||||
@ -433,6 +450,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
* We have unprocessed subobjs. Process the next one.
|
||||
*/
|
||||
ASSERT(bpo->bpo_havecomp);
|
||||
ASSERT3P(bpobj_size, ==, NULL);
|
||||
|
||||
/* Add the last subobj to stack. */
|
||||
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
|
||||
@ -489,16 +507,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
int
|
||||
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
|
||||
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate the entries. If func returns nonzero, iteration will stop.
|
||||
*
|
||||
* If there are no subobjs:
|
||||
*
|
||||
* *bpobj_size can be used to return the number of block pointers in the
|
||||
* bpobj. Note that this may be different from the number of block pointers
|
||||
* that are iterated over, if iteration is terminated early (e.g. by the func
|
||||
* returning nonzero).
|
||||
*
|
||||
* If there are concurrent (or subsequent) modifications to the bpobj then the
|
||||
* returned *bpobj_size can be passed as "start" to
|
||||
* livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
|
||||
*/
|
||||
int
|
||||
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
|
||||
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
|
||||
uint64_t *bpobj_size)
|
||||
{
|
||||
return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
|
||||
return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate over the blkptrs in the bpobj beginning at index start. If func
|
||||
* returns nonzero, iteration will stop. This is a livelist specific function
|
||||
* since it assumes that there are no subobjs present.
|
||||
*/
|
||||
int
|
||||
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
|
||||
int64_t start)
|
||||
{
|
||||
if (bpo->bpo_havesubobj)
|
||||
VERIFY0(bpo->bpo_phys->bpo_subobjs);
|
||||
bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
|
||||
int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
|
||||
kmem_free(bpi, sizeof (bpobj_info_t));
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -724,7 +771,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
|
||||
}
|
||||
|
||||
void
|
||||
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
blkptr_t stored_bp = *bp;
|
||||
uint64_t offset;
|
||||
@ -755,8 +803,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
|
||||
}
|
||||
|
||||
/* We never need the fill count. */
|
||||
stored_bp.blk_fill = 0;
|
||||
BP_SET_FREE(&stored_bp, bp_freed);
|
||||
|
||||
mutex_enter(&bpo->bpo_lock);
|
||||
|
||||
@ -779,11 +827,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
|
||||
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
|
||||
bpo->bpo_phys->bpo_num_blkptrs++;
|
||||
bpo->bpo_phys->bpo_bytes +=
|
||||
int sign = bp_freed ? -1 : +1;
|
||||
bpo->bpo_phys->bpo_bytes += sign *
|
||||
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
|
||||
if (bpo->bpo_havecomp) {
|
||||
bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
|
||||
bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
|
||||
bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
|
||||
bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
|
||||
}
|
||||
if (bp_freed) {
|
||||
ASSERT(bpo->bpo_havefreed);
|
||||
bpo->bpo_phys->bpo_num_freed++;
|
||||
}
|
||||
mutex_exit(&bpo->bpo_lock);
|
||||
}
|
||||
@ -799,7 +852,7 @@ struct space_range_arg {
|
||||
|
||||
/* ARGSUSED */
|
||||
static int
|
||||
space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
|
||||
{
|
||||
struct space_range_arg *sra = arg;
|
||||
|
||||
@ -863,3 +916,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
|
||||
*uncompp = sra.uncomp;
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
* A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
|
||||
* bpobj are designated as free or allocated that information is not preserved
|
||||
* in bplists.
|
||||
*/
|
||||
/* ARGSUSED */
|
||||
int
|
||||
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
bplist_t *bpl = arg;
|
||||
bplist_append(bpl, bp);
|
||||
return (0);
|
||||
}
|
||||
|
@ -3286,6 +3286,13 @@ dbuf_hold_impl_arg(struct dbuf_hold_arg *dh)
|
||||
|
||||
*(dh->dh_dbp) = NULL;
|
||||
|
||||
/* If the pool has been created, verify the tx_sync_lock is not held */
|
||||
spa_t *spa = dh->dh_dn->dn_objset->os_spa;
|
||||
dsl_pool_t *dp = spa->spa_dsl_pool;
|
||||
if (dp != NULL) {
|
||||
ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
|
||||
}
|
||||
|
||||
/* dbuf_find() returns with db_mtx held */
|
||||
dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
|
||||
dh->dh_level, dh->dh_blkid);
|
||||
@ -4479,6 +4486,29 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
|
||||
drica.drica_tx = tx;
|
||||
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
|
||||
&drica)) {
|
||||
/*
|
||||
* If the blkptr being remapped is tracked by a livelist,
|
||||
* then we need to make sure the livelist reflects the update.
|
||||
* First, cancel out the old blkptr by appending a 'FREE'
|
||||
* entry. Next, add an 'ALLOC' to track the new version. This
|
||||
* way we avoid trying to free an inaccurate blkptr at delete.
|
||||
* Note that embedded blkptrs are not tracked in livelists.
|
||||
*/
|
||||
if (dn->dn_objset != spa_meta_objset(spa)) {
|
||||
dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
|
||||
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
|
||||
bp->blk_birth > ds->ds_dir->dd_origin_txg) {
|
||||
ASSERT(!BP_IS_EMBEDDED(bp));
|
||||
ASSERT(dsl_dir_is_clone(ds->ds_dir));
|
||||
ASSERT(spa_feature_is_enabled(spa,
|
||||
SPA_FEATURE_LIVELIST));
|
||||
bplist_append(&ds->ds_dir->dd_pending_frees,
|
||||
bp);
|
||||
bplist_append(&ds->ds_dir->dd_pending_allocs,
|
||||
&bp_copy);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The db_rwlock prevents dbuf_read_impl() from
|
||||
* dereferencing the BP while we are changing it. To
|
||||
|
@ -122,13 +122,12 @@ parent_delta(dsl_dataset_t *ds, int64_t delta)
|
||||
void
|
||||
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
{
|
||||
int used, compressed, uncompressed;
|
||||
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
|
||||
int used = bp_get_dsize_sync(spa, bp);
|
||||
int compressed = BP_GET_PSIZE(bp);
|
||||
int uncompressed = BP_GET_UCSIZE(bp);
|
||||
int64_t delta;
|
||||
|
||||
used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
|
||||
compressed = BP_GET_PSIZE(bp);
|
||||
uncompressed = BP_GET_UCSIZE(bp);
|
||||
|
||||
dprintf_bp(bp, "ds=%p", ds);
|
||||
|
||||
ASSERT(dmu_tx_is_syncing(tx));
|
||||
@ -164,6 +163,19 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
ds->ds_feature_activation[f] = (void *)B_TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Track block for livelist, but ignore embedded blocks because
|
||||
* they do not need to be freed.
|
||||
*/
|
||||
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
|
||||
bp->blk_birth > ds->ds_dir->dd_origin_txg &&
|
||||
!(BP_IS_EMBEDDED(bp))) {
|
||||
ASSERT(dsl_dir_is_clone(ds->ds_dir));
|
||||
ASSERT(spa_feature_is_enabled(spa,
|
||||
SPA_FEATURE_LIVELIST));
|
||||
bplist_append(&ds->ds_dir->dd_pending_allocs, bp);
|
||||
}
|
||||
|
||||
mutex_exit(&ds->ds_lock);
|
||||
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
|
||||
compressed, uncompressed, tx);
|
||||
@ -207,8 +219,8 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
|
||||
DVA_SET_VDEV(dva, vdev);
|
||||
DVA_SET_OFFSET(dva, offset);
|
||||
DVA_SET_ASIZE(dva, size);
|
||||
|
||||
dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx);
|
||||
dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE,
|
||||
tx);
|
||||
}
|
||||
}
|
||||
|
||||
@ -239,6 +251,19 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
|
||||
ASSERT(!ds->ds_is_snapshot);
|
||||
dmu_buf_will_dirty(ds->ds_dbuf, tx);
|
||||
|
||||
/*
|
||||
* Track block for livelist, but ignore embedded blocks because
|
||||
* they do not need to be freed.
|
||||
*/
|
||||
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
|
||||
bp->blk_birth > ds->ds_dir->dd_origin_txg &&
|
||||
!(BP_IS_EMBEDDED(bp))) {
|
||||
ASSERT(dsl_dir_is_clone(ds->ds_dir));
|
||||
ASSERT(spa_feature_is_enabled(spa,
|
||||
SPA_FEATURE_LIVELIST));
|
||||
bplist_append(&ds->ds_dir->dd_pending_frees, bp);
|
||||
}
|
||||
|
||||
if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
|
||||
int64_t delta;
|
||||
|
||||
@ -267,7 +292,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
|
||||
*/
|
||||
bplist_append(&ds->ds_pending_deadlist, bp);
|
||||
} else {
|
||||
dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
|
||||
dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx);
|
||||
}
|
||||
ASSERT3U(ds->ds_prev->ds_object, ==,
|
||||
dsl_dataset_phys(ds)->ds_prev_snap_obj);
|
||||
@ -1241,6 +1266,14 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
|
||||
|
||||
ASSERT(dmu_tx_is_syncing(tx));
|
||||
ASSERT(lastname[0] != '@');
|
||||
/*
|
||||
* Filesystems will eventually have their origin set to dp_origin_snap,
|
||||
* but that's taken care of in dsl_dataset_create_sync_dd. When
|
||||
* creating a filesystem, this function is called with origin equal to
|
||||
* NULL.
|
||||
*/
|
||||
if (origin != NULL)
|
||||
ASSERT3P(origin, !=, dp->dp_origin_snap);
|
||||
|
||||
ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
|
||||
VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
|
||||
@ -1250,6 +1283,20 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
|
||||
|
||||
dsl_deleg_set_create_perms(dd, tx, cr);
|
||||
|
||||
/*
|
||||
* If we are creating a clone and the livelist feature is enabled,
|
||||
* add the entry DD_FIELD_LIVELIST to ZAP.
|
||||
*/
|
||||
if (origin != NULL &&
|
||||
spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) {
|
||||
objset_t *mos = dd->dd_pool->dp_meta_objset;
|
||||
dsl_dir_zapify(dd, tx);
|
||||
uint64_t obj = dsl_deadlist_alloc(mos, tx);
|
||||
VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST,
|
||||
sizeof (uint64_t), 1, &obj, tx));
|
||||
spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Since we're creating a new node we know it's a leaf, so we can
|
||||
* initialize the counts if the limit feature is active.
|
||||
@ -2036,12 +2083,149 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
/*
|
||||
* Check if the percentage of blocks shared between the clone and the
|
||||
* snapshot (as opposed to those that are clone only) is below a certain
|
||||
* threshold
|
||||
*/
|
||||
boolean_t
|
||||
dsl_livelist_should_disable(dsl_dataset_t *ds)
|
||||
{
|
||||
dsl_deadlist_t *dl = arg;
|
||||
dsl_deadlist_insert(dl, bp, tx);
|
||||
return (0);
|
||||
uint64_t used, referenced;
|
||||
int percent_shared;
|
||||
|
||||
used = dsl_dir_get_usedds(ds->ds_dir);
|
||||
referenced = dsl_get_referenced(ds);
|
||||
ASSERT3U(referenced, >=, 0);
|
||||
ASSERT3U(used, >=, 0);
|
||||
if (referenced == 0)
|
||||
return (B_FALSE);
|
||||
percent_shared = (100 * (referenced - used)) / referenced;
|
||||
if (percent_shared <= zfs_livelist_min_percent_shared)
|
||||
return (B_TRUE);
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if it is possible to combine two livelist entries into one.
|
||||
* This is the case if the combined number of 'live' blkptrs (ALLOCs that
|
||||
* don't have a matching FREE) is under the maximum sublist size.
|
||||
* We check this by subtracting twice the total number of frees from the total
|
||||
* number of blkptrs. FREEs are counted twice because each FREE blkptr
|
||||
* will cancel out an ALLOC blkptr when the livelist is processed.
|
||||
*/
|
||||
static boolean_t
|
||||
dsl_livelist_should_condense(dsl_deadlist_entry_t *first,
|
||||
dsl_deadlist_entry_t *next)
|
||||
{
|
||||
uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed +
|
||||
next->dle_bpobj.bpo_phys->bpo_num_freed;
|
||||
uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs +
|
||||
next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
|
||||
if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries)
|
||||
return (B_TRUE);
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
typedef struct try_condense_arg {
|
||||
spa_t *spa;
|
||||
dsl_dataset_t *ds;
|
||||
} try_condense_arg_t;
|
||||
|
||||
/*
|
||||
* Iterate over the livelist entries, searching for a pair to condense.
|
||||
* A nonzero return value means stop, 0 means keep looking.
|
||||
*/
|
||||
static int
|
||||
dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first)
|
||||
{
|
||||
try_condense_arg_t *tca = arg;
|
||||
spa_t *spa = tca->spa;
|
||||
dsl_dataset_t *ds = tca->ds;
|
||||
dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
|
||||
dsl_deadlist_entry_t *next;
|
||||
|
||||
/* The condense thread has not yet been created at import */
|
||||
if (spa->spa_livelist_condense_zthr == NULL)
|
||||
return (1);
|
||||
|
||||
/* A condense is already in progress */
|
||||
if (spa->spa_to_condense.ds != NULL)
|
||||
return (1);
|
||||
|
||||
next = AVL_NEXT(&ll->dl_tree, &first->dle_node);
|
||||
/* The livelist has only one entry - don't condense it */
|
||||
if (next == NULL)
|
||||
return (1);
|
||||
|
||||
/* Next is the newest entry - don't condense it */
|
||||
if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL)
|
||||
return (1);
|
||||
|
||||
/* This pair is not ready to condense but keep looking */
|
||||
if (!dsl_livelist_should_condense(first, next))
|
||||
return (0);
|
||||
|
||||
/*
|
||||
* Add a ref to prevent the dataset from being evicted while
|
||||
* the condense zthr or synctask are running. Ref will be
|
||||
* released at the end of the condense synctask
|
||||
*/
|
||||
dmu_buf_add_ref(ds->ds_dbuf, spa);
|
||||
|
||||
spa->spa_to_condense.ds = ds;
|
||||
spa->spa_to_condense.first = first;
|
||||
spa->spa_to_condense.next = next;
|
||||
spa->spa_to_condense.syncing = B_FALSE;
|
||||
spa->spa_to_condense.cancelled = B_FALSE;
|
||||
|
||||
zthr_wakeup(spa->spa_livelist_condense_zthr);
|
||||
return (1);
|
||||
}
|
||||
|
||||
static void
|
||||
dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx)
|
||||
{
|
||||
dsl_dir_t *dd = ds->ds_dir;
|
||||
spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
|
||||
dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist);
|
||||
|
||||
/* Check if we need to add a new sub-livelist */
|
||||
if (last == NULL) {
|
||||
/* The livelist is empty */
|
||||
dsl_deadlist_add_key(&dd->dd_livelist,
|
||||
tx->tx_txg - 1, tx);
|
||||
} else if (spa_sync_pass(spa) == 1) {
|
||||
/*
|
||||
* Check if the newest entry is full. If it is, make a new one.
|
||||
* We only do this once per sync because we could overfill a
|
||||
* sublist in one sync pass and don't want to add another entry
|
||||
* for a txg that is already represented. This ensures that
|
||||
* blkptrs born in the same txg are stored in the same sublist.
|
||||
*/
|
||||
bpobj_t bpobj = last->dle_bpobj;
|
||||
uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;
|
||||
uint64_t free = bpobj.bpo_phys->bpo_num_freed;
|
||||
uint64_t alloc = all - free;
|
||||
if (alloc > zfs_livelist_max_entries) {
|
||||
dsl_deadlist_add_key(&dd->dd_livelist,
|
||||
tx->tx_txg - 1, tx);
|
||||
}
|
||||
}
|
||||
|
||||
/* Insert each entry into the on-disk livelist */
|
||||
bplist_iterate(&dd->dd_pending_allocs,
|
||||
dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);
|
||||
bplist_iterate(&dd->dd_pending_frees,
|
||||
dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);
|
||||
|
||||
/* Attempt to condense every pair of adjacent entries */
|
||||
try_condense_arg_t arg = {
|
||||
.spa = spa,
|
||||
.ds = ds
|
||||
};
|
||||
dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,
|
||||
&arg);
|
||||
}
|
||||
|
||||
void
|
||||
@ -2050,7 +2234,14 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
|
||||
objset_t *os = ds->ds_objset;
|
||||
|
||||
bplist_iterate(&ds->ds_pending_deadlist,
|
||||
deadlist_enqueue_cb, &ds->ds_deadlist, tx);
|
||||
dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);
|
||||
|
||||
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
|
||||
dsl_flush_pending_livelist(ds, tx);
|
||||
if (dsl_livelist_should_disable(ds)) {
|
||||
dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);
|
||||
}
|
||||
}
|
||||
|
||||
dsl_bookmark_sync_done(ds, tx);
|
||||
|
||||
@ -3335,6 +3526,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
|
||||
uint64_t oldnext_obj;
|
||||
int64_t delta;
|
||||
|
||||
ASSERT(nvlist_empty(ddpa->err_ds));
|
||||
|
||||
VERIFY0(promote_hold(ddpa, dp, FTAG));
|
||||
hds = ddpa->ddpa_clone;
|
||||
|
||||
@ -3519,6 +3712,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
|
||||
|
||||
dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
|
||||
|
||||
/*
|
||||
* Since livelists are specific to a clone's origin txg, they
|
||||
* are no longer accurate. Destroy the livelist from the clone being
|
||||
* promoted. If the origin dataset is a clone, destroy its livelist
|
||||
* as well.
|
||||
*/
|
||||
dsl_dir_remove_livelist(dd, tx, B_TRUE);
|
||||
dsl_dir_remove_livelist(origin_ds->ds_dir, tx, B_TRUE);
|
||||
|
||||
/* log history record */
|
||||
spa_history_log_internal_ds(hds, "promote", tx, "");
|
||||
|
||||
@ -3990,6 +4192,14 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
|
||||
|
||||
dsl_scan_ds_clone_swapped(origin_head, clone, tx);
|
||||
|
||||
/*
|
||||
* Destroy any livelists associated with the clone or the origin,
|
||||
* since after the swap the corresponding livelists are no longer
|
||||
* valid.
|
||||
*/
|
||||
dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE);
|
||||
dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE);
|
||||
|
||||
spa_history_log_internal_ds(clone, "clone swap", tx,
|
||||
"parent=%s", origin_head->ds_dir->dd_myname);
|
||||
}
|
||||
|
@ -20,16 +20,16 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/dsl_dataset.h>
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/refcount.h>
|
||||
#include <sys/zap.h>
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/dsl_pool.h>
|
||||
#include <sys/dsl_dataset.h>
|
||||
|
||||
/*
|
||||
* Deadlist concurrency:
|
||||
@ -51,6 +51,68 @@
|
||||
* provides its own locking, and dl_oldfmt is immutable.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Livelist Overview
|
||||
* ================
|
||||
*
|
||||
* Livelists use the same 'deadlist_t' struct as deadlists and are also used
|
||||
* to track blkptrs over the lifetime of a dataset. Livelists, however, belong
|
||||
* to clones and track the blkptrs that are clone-specific (were born after
|
||||
* the clone's creation). The exception is embedded block pointers which are
|
||||
* not included in livelists because they do not need to be freed.
|
||||
*
|
||||
* When it comes time to delete the clone, the livelist provides a quick
|
||||
* reference as to what needs to be freed. For this reason, livelists also track
|
||||
* when clone-specific blkptrs are freed before deletion to prevent double
|
||||
* frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the
|
||||
* deletion algorithm iterates backwards over the livelist, matching
|
||||
* FREE/ALLOC pairs and then freeing those ALLOCs which remain. Livelists
|
||||
* are also updated in the case when blkptrs are remapped: the old version
|
||||
* of the blkptr is cancelled out with a FREE and the new version is tracked
|
||||
* with an ALLOC.
|
||||
*
|
||||
* To bound the amount of memory required for deletion, livelists over a
|
||||
* certain size are spread over multiple entries. Entries are grouped by
|
||||
* birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
|
||||
* be in the same entry. This allows us to delete livelists incrementally
|
||||
* over multiple syncs, one entry at a time.
|
||||
*
|
||||
* During the lifetime of the clone, livelists can get extremely large.
|
||||
* Their size is managed by periodic condensing (preemptively cancelling out
|
||||
* FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
|
||||
* the shared space between the clone and its origin is so small that it
|
||||
* doesn't make sense to use livelists anymore.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The threshold sublist size at which we create a new sub-livelist for the
|
||||
* next txg. However, since blkptrs of the same transaction group must be in
|
||||
* the same sub-list, the actual sublist size may exceed this. When picking the
|
||||
* size we had to balance the fact that larger sublists mean fewer sublists
|
||||
* (decreasing the cost of insertion) against the consideration that sublists
|
||||
* will be loaded into memory and shouldn't take up an inordinate amount of
|
||||
* space. We settled on ~500000 entries, corresponding to roughly 128M.
|
||||
*/
|
||||
unsigned long zfs_livelist_max_entries = 500000;
|
||||
|
||||
/*
|
||||
* We can approximate how much of a performance gain a livelist will give us
|
||||
* based on the percentage of blocks shared between the clone and its origin.
|
||||
* 0 percent shared means that the clone has completely diverged and that the
|
||||
* old method is maximally effective: every read from the block tree will
|
||||
* result in lots of frees. Livelists give us gains when they track blocks
|
||||
* scattered across the tree, when one read in the old method might only
|
||||
* result in a few frees. Once the clone has been overwritten enough,
|
||||
* writes are no longer sparse and we'll no longer get much of a benefit from
|
||||
* tracking them with a livelist. We chose a lower limit of 75 percent shared
|
||||
* (25 percent overwritten). This means that 1/4 of all block pointers will be
|
||||
* freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
|
||||
* to make deletion 4x faster. Once the amount of shared space drops below this
|
||||
* threshold, the clone will revert to the old deletion method.
|
||||
*/
|
||||
int zfs_livelist_min_percent_shared = 75;
|
||||
|
||||
|
||||
static int
|
||||
dsl_deadlist_compare(const void *arg1, const void *arg2)
|
||||
{
|
||||
@ -88,6 +150,23 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
|
||||
dl->dl_havetree = B_TRUE;
|
||||
}
|
||||
|
||||
void
|
||||
dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
|
||||
{
|
||||
dsl_deadlist_entry_t *dle;
|
||||
|
||||
ASSERT(dsl_deadlist_is_open(dl));
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
dsl_deadlist_load_tree(dl);
|
||||
mutex_exit(&dl->dl_lock);
|
||||
for (dle = avl_first(&dl->dl_tree); dle != NULL;
|
||||
dle = AVL_NEXT(&dl->dl_tree, dle)) {
|
||||
if (func(args, dle) != 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
|
||||
{
|
||||
@ -188,7 +267,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
|
||||
|
||||
static void
|
||||
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
|
||||
const blkptr_t *bp, dmu_tx_t *tx)
|
||||
const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&dl->dl_lock));
|
||||
if (dle->dle_bpobj.bpo_object ==
|
||||
@ -200,7 +279,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
|
||||
VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
|
||||
dle->dle_mintxg, obj, tx));
|
||||
}
|
||||
bpobj_enqueue(&dle->dle_bpobj, bp, tx);
|
||||
bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -221,14 +300,15 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
|
||||
}
|
||||
|
||||
void
|
||||
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
dsl_deadlist_entry_t dle_tofind;
|
||||
dsl_deadlist_entry_t *dle;
|
||||
avl_index_t where;
|
||||
|
||||
if (dl->dl_oldfmt) {
|
||||
bpobj_enqueue(&dl->dl_bpobj, bp, tx);
|
||||
bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -236,10 +316,12 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
dsl_deadlist_load_tree(dl);
|
||||
|
||||
dmu_buf_will_dirty(dl->dl_dbuf, tx);
|
||||
|
||||
int sign = bp_freed ? -1 : +1;
|
||||
dl->dl_phys->dl_used +=
|
||||
bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
|
||||
dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
|
||||
dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
|
||||
sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
|
||||
dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
|
||||
dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
|
||||
|
||||
dle_tofind.dle_mintxg = bp->blk_birth;
|
||||
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
|
||||
@ -255,10 +337,26 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
}
|
||||
|
||||
ASSERT3P(dle, !=, NULL);
|
||||
dle_enqueue(dl, dle, bp, tx);
|
||||
dle_enqueue(dl, dle, bp, bp_freed, tx);
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
int
|
||||
dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
{
|
||||
dsl_deadlist_t *dl = arg;
|
||||
dsl_deadlist_insert(dl, bp, B_FALSE, tx);
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
{
|
||||
dsl_deadlist_t *dl = arg;
|
||||
dsl_deadlist_insert(dl, bp, B_TRUE, tx);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert new key in deadlist, which must be > all current entries.
|
||||
* mintxg is not inclusive.
|
||||
@ -316,6 +414,108 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove a deadlist entry and all of its contents by removing the entry from
|
||||
* the deadlist's avl tree, freeing the entry's bpobj and adjusting the
|
||||
* deadlist's space accounting accordingly.
|
||||
*/
|
||||
void
|
||||
dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
|
||||
{
|
||||
uint64_t used, comp, uncomp;
|
||||
dsl_deadlist_entry_t dle_tofind;
|
||||
dsl_deadlist_entry_t *dle;
|
||||
objset_t *os = dl->dl_os;
|
||||
|
||||
if (dl->dl_oldfmt)
|
||||
return;
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
dsl_deadlist_load_tree(dl);
|
||||
|
||||
dle_tofind.dle_mintxg = mintxg;
|
||||
dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
|
||||
VERIFY3P(dle, !=, NULL);
|
||||
|
||||
avl_remove(&dl->dl_tree, dle);
|
||||
VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
|
||||
VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
|
||||
dl->dl_phys->dl_used -= used;
|
||||
dl->dl_phys->dl_comp -= comp;
|
||||
dl->dl_phys->dl_uncomp -= uncomp;
|
||||
if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
|
||||
bpobj_decr_empty(os, tx);
|
||||
} else {
|
||||
bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
|
||||
}
|
||||
bpobj_close(&dle->dle_bpobj);
|
||||
kmem_free(dle, sizeof (*dle));
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear out the contents of a deadlist_entry by freeing its bpobj,
|
||||
* replacing it with an empty bpobj and adjusting the deadlist's
|
||||
* space accounting
|
||||
*/
|
||||
void
|
||||
dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
uint64_t new_obj, used, comp, uncomp;
|
||||
objset_t *os = dl->dl_os;
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
|
||||
VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
|
||||
dl->dl_phys->dl_used -= used;
|
||||
dl->dl_phys->dl_comp -= comp;
|
||||
dl->dl_phys->dl_uncomp -= uncomp;
|
||||
if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
|
||||
bpobj_decr_empty(os, tx);
|
||||
else
|
||||
bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
|
||||
bpobj_close(&dle->dle_bpobj);
|
||||
new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
|
||||
VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
|
||||
VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
|
||||
new_obj, tx));
|
||||
ASSERT(bpobj_is_empty(&dle->dle_bpobj));
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the first entry in deadlist's avl tree
|
||||
*/
|
||||
dsl_deadlist_entry_t *
|
||||
dsl_deadlist_first(dsl_deadlist_t *dl)
|
||||
{
|
||||
dsl_deadlist_entry_t *dle;
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
dsl_deadlist_load_tree(dl);
|
||||
dle = avl_first(&dl->dl_tree);
|
||||
mutex_exit(&dl->dl_lock);
|
||||
|
||||
return (dle);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the last entry in deadlist's avl tree
|
||||
*/
|
||||
dsl_deadlist_entry_t *
|
||||
dsl_deadlist_last(dsl_deadlist_t *dl)
|
||||
{
|
||||
dsl_deadlist_entry_t *dle;
|
||||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
dsl_deadlist_load_tree(dl);
|
||||
dle = avl_last(&dl->dl_tree);
|
||||
mutex_exit(&dl->dl_lock);
|
||||
|
||||
return (dle);
|
||||
}
|
||||
|
||||
/*
|
||||
* Walk ds's snapshots to regenerate the ZAP & AVL.
|
||||
*/
|
||||
@ -478,10 +678,11 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
|
||||
}
|
||||
|
||||
static int
|
||||
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
dsl_deadlist_t *dl = arg;
|
||||
dsl_deadlist_insert(dl, bp, tx);
|
||||
dsl_deadlist_insert(dl, bp, bp_freed, tx);
|
||||
return (0);
|
||||
}
|
||||
|
||||
@ -572,3 +773,109 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
|
||||
}
|
||||
mutex_exit(&dl->dl_lock);
|
||||
}
|
||||
|
||||
typedef struct livelist_entry {
|
||||
const blkptr_t *le_bp;
|
||||
avl_node_t le_node;
|
||||
} livelist_entry_t;
|
||||
|
||||
static int
|
||||
livelist_compare(const void *larg, const void *rarg)
|
||||
{
|
||||
const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
|
||||
const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
|
||||
|
||||
/* Sort them according to dva[0] */
|
||||
uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
|
||||
uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
|
||||
|
||||
if (l_dva0_vdev != r_dva0_vdev)
|
||||
return (AVL_CMP(l_dva0_vdev, r_dva0_vdev));
|
||||
|
||||
/* if vdevs are equal, sort by offsets. */
|
||||
uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
|
||||
uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
|
||||
if (l_dva0_offset == r_dva0_offset)
|
||||
ASSERT3U(l->blk_birth, ==, r->blk_birth);
|
||||
return (AVL_CMP(l_dva0_offset, r_dva0_offset));
|
||||
}
|
||||
|
||||
struct livelist_iter_arg {
|
||||
avl_tree_t *avl;
|
||||
bplist_t *to_free;
|
||||
zthr_t *t;
|
||||
};
|
||||
|
||||
/*
|
||||
* Expects an AVL tree which is incrementally filled will FREE blkptrs
|
||||
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
|
||||
* corresponding FREE are stored in the supplied bplist.
|
||||
*/
|
||||
static int
|
||||
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
struct livelist_iter_arg *lia = arg;
|
||||
avl_tree_t *avl = lia->avl;
|
||||
bplist_t *to_free = lia->to_free;
|
||||
zthr_t *t = lia->t;
|
||||
ASSERT(tx == NULL);
|
||||
|
||||
if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
|
||||
return (SET_ERROR(EINTR));
|
||||
if (bp_freed) {
|
||||
livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
|
||||
KM_SLEEP);
|
||||
blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
|
||||
*temp_bp = *bp;
|
||||
node->le_bp = temp_bp;
|
||||
avl_add(avl, node);
|
||||
} else {
|
||||
livelist_entry_t node;
|
||||
node.le_bp = bp;
|
||||
livelist_entry_t *found = avl_find(avl, &node, NULL);
|
||||
if (found != NULL) {
|
||||
avl_remove(avl, found);
|
||||
kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
|
||||
kmem_free(found, sizeof (livelist_entry_t));
|
||||
} else {
|
||||
bplist_append(to_free, bp);
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs
|
||||
* which have an ALLOC entry but no matching FREE
|
||||
*/
|
||||
int
|
||||
dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
|
||||
uint64_t *size)
|
||||
{
|
||||
avl_tree_t avl;
|
||||
avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
|
||||
offsetof(livelist_entry_t, le_node));
|
||||
|
||||
/* process the sublist */
|
||||
struct livelist_iter_arg arg = {
|
||||
.avl = &avl,
|
||||
.to_free = to_free,
|
||||
.t = t
|
||||
};
|
||||
int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
|
||||
|
||||
avl_destroy(&avl);
|
||||
return (err);
|
||||
}
|
||||
|
||||
#if defined(_KERNEL)
|
||||
/* CSTYLED */
|
||||
module_param(zfs_livelist_max_entries, ulong, 0644);
|
||||
MODULE_PARM_DESC(zfs_livelist_max_entries,
|
||||
"Size to start the next sub-livelist in a livelist");
|
||||
|
||||
module_param(zfs_livelist_min_percent_shared, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_livelist_min_percent_shared,
|
||||
"Threshold at which livelist is disabled");
|
||||
#endif
|
||||
|
@ -45,6 +45,9 @@
|
||||
#include <sys/dmu_impl.h>
|
||||
#include <sys/zvol.h>
|
||||
#include <sys/zcp.h>
|
||||
#include <sys/dsl_deadlist.h>
|
||||
#include <sys/zthr.h>
|
||||
#include <sys/spa_impl.h>
|
||||
|
||||
int
|
||||
dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
|
||||
@ -120,7 +123,7 @@ struct process_old_arg {
|
||||
};
|
||||
|
||||
static int
|
||||
process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
|
||||
{
|
||||
struct process_old_arg *poa = arg;
|
||||
dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
|
||||
@ -128,7 +131,7 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
ASSERT(!BP_IS_HOLE(bp));
|
||||
|
||||
if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
|
||||
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
|
||||
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
|
||||
if (poa->ds_prev && !poa->after_branch_point &&
|
||||
bp->blk_birth >
|
||||
dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
|
||||
@ -852,6 +855,127 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
|
||||
dmu_object_free_zapified(mos, ddobj, tx);
|
||||
}
|
||||
|
||||
static void
|
||||
dsl_clone_destroy_assert(dsl_dir_t *dd)
|
||||
{
|
||||
uint64_t used, comp, uncomp;
|
||||
|
||||
ASSERT(dsl_dir_is_clone(dd));
|
||||
dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
|
||||
|
||||
ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used);
|
||||
ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp);
|
||||
/*
|
||||
* Greater than because we do not track embedded block pointers in
|
||||
* the livelist
|
||||
*/
|
||||
ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp);
|
||||
|
||||
ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list));
|
||||
ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list));
|
||||
}
|
||||
|
||||
/*
|
||||
* Start the delete process for a clone. Free its zil, verify the space usage
|
||||
* and queue the blkptrs for deletion by adding the livelist to the pool-wide
|
||||
* delete queue.
|
||||
*/
|
||||
static void
|
||||
dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
|
||||
{
|
||||
uint64_t zap_obj, to_delete, used, comp, uncomp;
|
||||
objset_t *os;
|
||||
dsl_dir_t *dd = ds->ds_dir;
|
||||
dsl_pool_t *dp = dmu_tx_pool(tx);
|
||||
objset_t *mos = dp->dp_meta_objset;
|
||||
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
|
||||
VERIFY0(dmu_objset_from_ds(ds, &os));
|
||||
|
||||
/* Check that the clone is in a correct state to be deleted */
|
||||
dsl_clone_destroy_assert(dd);
|
||||
|
||||
/* Destroy the zil */
|
||||
zil_destroy_sync(dmu_objset_zil(os), tx);
|
||||
|
||||
VERIFY0(zap_lookup(mos, dd->dd_object,
|
||||
DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete));
|
||||
/* Initialize deleted_clones entry to track livelists to cleanup */
|
||||
int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
|
||||
if (error == ENOENT) {
|
||||
zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA,
|
||||
DMU_OT_NONE, 0, tx);
|
||||
VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1,
|
||||
&(zap_obj), tx));
|
||||
spa->spa_livelists_to_delete = zap_obj;
|
||||
} else if (error != 0) {
|
||||
zfs_panic_recover("zfs: error %d was returned while looking "
|
||||
"up DMU_POOL_DELETED_CLONES in the zap");
|
||||
return;
|
||||
}
|
||||
VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx));
|
||||
|
||||
/* Clone is no longer using space, now tracked by dp_free_dir */
|
||||
dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
|
||||
dsl_dir_diduse_space(dd, DD_USED_HEAD,
|
||||
-used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes,
|
||||
tx);
|
||||
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
|
||||
used, comp, uncomp, tx);
|
||||
dsl_dir_remove_livelist(dd, tx, B_FALSE);
|
||||
zthr_wakeup(spa->spa_livelist_delete_zthr);
|
||||
}
|
||||
|
||||
/*
|
||||
* Move the bptree into the pool's list of trees to clean up, update space
|
||||
* accounting information and destroy the zil.
|
||||
*/
|
||||
void
|
||||
dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
|
||||
{
|
||||
uint64_t used, comp, uncomp;
|
||||
objset_t *os;
|
||||
|
||||
VERIFY0(dmu_objset_from_ds(ds, &os));
|
||||
dsl_pool_t *dp = dmu_tx_pool(tx);
|
||||
objset_t *mos = dp->dp_meta_objset;
|
||||
|
||||
zil_destroy_sync(dmu_objset_zil(os), tx);
|
||||
|
||||
if (!spa_feature_is_active(dp->dp_spa,
|
||||
SPA_FEATURE_ASYNC_DESTROY)) {
|
||||
dsl_scan_t *scn = dp->dp_scan;
|
||||
spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
|
||||
tx);
|
||||
dp->dp_bptree_obj = bptree_alloc(mos, tx);
|
||||
VERIFY0(zap_add(mos,
|
||||
DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
|
||||
&dp->dp_bptree_obj, tx));
|
||||
ASSERT(!scn->scn_async_destroying);
|
||||
scn->scn_async_destroying = B_TRUE;
|
||||
}
|
||||
|
||||
used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
|
||||
comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
|
||||
uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
|
||||
|
||||
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
|
||||
dsl_dataset_phys(ds)->ds_unique_bytes == used);
|
||||
|
||||
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
|
||||
bptree_add(mos, dp->dp_bptree_obj,
|
||||
&dsl_dataset_phys(ds)->ds_bp,
|
||||
dsl_dataset_phys(ds)->ds_prev_snap_txg,
|
||||
used, comp, uncomp, tx);
|
||||
rrw_exit(&ds->ds_bp_rwlock, FTAG);
|
||||
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
|
||||
-used, -comp, -uncomp, tx);
|
||||
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
|
||||
used, comp, uncomp, tx);
|
||||
}
|
||||
|
||||
void
|
||||
dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
|
||||
{
|
||||
@ -911,7 +1035,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
|
||||
}
|
||||
|
||||
/*
|
||||
* Destroy the deadlist. Unless it's a clone, the
|
||||
* Destroy the deadlist. Unless it's a clone, the
|
||||
* deadlist should be empty since the dataset has no snapshots.
|
||||
* (If it's a clone, it's safe to ignore the deadlist contents
|
||||
* since they are still referenced by the origin snapshot.)
|
||||
@ -924,51 +1048,18 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
|
||||
if (dsl_dataset_remap_deadlist_exists(ds))
|
||||
dsl_dataset_destroy_remap_deadlist(ds, tx);
|
||||
|
||||
objset_t *os;
|
||||
VERIFY0(dmu_objset_from_ds(ds, &os));
|
||||
|
||||
if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
|
||||
old_synchronous_dataset_destroy(ds, tx);
|
||||
/*
|
||||
* Each destroy is responsible for both destroying (enqueuing
|
||||
* to be destroyed) the blkptrs comprising the dataset as well as
|
||||
* those belonging to the zil.
|
||||
*/
|
||||
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
|
||||
dsl_async_clone_destroy(ds, tx);
|
||||
} else if (spa_feature_is_enabled(dp->dp_spa,
|
||||
SPA_FEATURE_ASYNC_DESTROY)) {
|
||||
dsl_async_dataset_destroy(ds, tx);
|
||||
} else {
|
||||
/*
|
||||
* Move the bptree into the pool's list of trees to
|
||||
* clean up and update space accounting information.
|
||||
*/
|
||||
uint64_t used, comp, uncomp;
|
||||
|
||||
zil_destroy_sync(dmu_objset_zil(os), tx);
|
||||
|
||||
if (!spa_feature_is_active(dp->dp_spa,
|
||||
SPA_FEATURE_ASYNC_DESTROY)) {
|
||||
dsl_scan_t *scn = dp->dp_scan;
|
||||
spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
|
||||
tx);
|
||||
dp->dp_bptree_obj = bptree_alloc(mos, tx);
|
||||
VERIFY0(zap_add(mos,
|
||||
DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
|
||||
&dp->dp_bptree_obj, tx));
|
||||
ASSERT(!scn->scn_async_destroying);
|
||||
scn->scn_async_destroying = B_TRUE;
|
||||
}
|
||||
|
||||
used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
|
||||
comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
|
||||
uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
|
||||
|
||||
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
|
||||
dsl_dataset_phys(ds)->ds_unique_bytes == used);
|
||||
|
||||
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
|
||||
bptree_add(mos, dp->dp_bptree_obj,
|
||||
&dsl_dataset_phys(ds)->ds_bp,
|
||||
dsl_dataset_phys(ds)->ds_prev_snap_txg,
|
||||
used, comp, uncomp, tx);
|
||||
rrw_exit(&ds->ds_bp_rwlock, FTAG);
|
||||
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
|
||||
-used, -comp, -uncomp, tx);
|
||||
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
|
||||
used, comp, uncomp, tx);
|
||||
old_synchronous_dataset_destroy(ds, tx);
|
||||
}
|
||||
|
||||
if (ds->ds_prev != NULL) {
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 Martin Matuska. All rights reserved.
|
||||
* Copyright (c) 2014 Joyent, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
@ -48,6 +48,7 @@
|
||||
#include <sys/policy.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#include <sys/zvol.h>
|
||||
#include <sys/zthr.h>
|
||||
#include "zfs_namecheck.h"
|
||||
#include "zfs_prop.h"
|
||||
|
||||
@ -155,6 +156,9 @@ dsl_dir_evict_async(void *dbu)
|
||||
|
||||
spa_async_close(dd->dd_pool->dp_spa, dd);
|
||||
|
||||
if (dsl_deadlist_is_open(&dd->dd_livelist))
|
||||
dsl_dir_livelist_close(dd);
|
||||
|
||||
dsl_prop_fini(dd);
|
||||
mutex_destroy(&dd->dd_lock);
|
||||
kmem_free(dd, sizeof (dsl_dir_t));
|
||||
@ -255,6 +259,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
|
||||
dd->dd_origin_txg =
|
||||
origin_phys->ds_creation_txg;
|
||||
dmu_buf_rele(origin_bonus, FTAG);
|
||||
if (dsl_dir_is_zapified(dd)) {
|
||||
uint64_t obj;
|
||||
err = zap_lookup(dp->dp_meta_objset,
|
||||
dd->dd_object, DD_FIELD_LIVELIST,
|
||||
sizeof (uint64_t), 1, &obj);
|
||||
if (err == 0)
|
||||
dsl_dir_livelist_open(dd, obj);
|
||||
else if (err != ENOENT)
|
||||
goto errout;
|
||||
}
|
||||
}
|
||||
|
||||
dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
|
||||
@ -263,6 +277,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
|
||||
if (winner != NULL) {
|
||||
if (dd->dd_parent)
|
||||
dsl_dir_rele(dd->dd_parent, dd);
|
||||
if (dsl_deadlist_is_open(&dd->dd_livelist))
|
||||
dsl_dir_livelist_close(dd);
|
||||
dsl_prop_fini(dd);
|
||||
mutex_destroy(&dd->dd_lock);
|
||||
kmem_free(dd, sizeof (dsl_dir_t));
|
||||
@ -291,6 +307,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
|
||||
errout:
|
||||
if (dd->dd_parent)
|
||||
dsl_dir_rele(dd->dd_parent, dd);
|
||||
if (dsl_deadlist_is_open(&dd->dd_livelist))
|
||||
dsl_dir_livelist_close(dd);
|
||||
dsl_prop_fini(dd);
|
||||
mutex_destroy(&dd->dd_lock);
|
||||
kmem_free(dd, sizeof (dsl_dir_t));
|
||||
@ -2178,6 +2196,90 @@ dsl_dir_is_zapified(dsl_dir_t *dd)
|
||||
return (doi.doi_type == DMU_OTN_ZAP_METADATA);
|
||||
}
|
||||
|
||||
void
|
||||
dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
|
||||
{
|
||||
objset_t *mos = dd->dd_pool->dp_meta_objset;
|
||||
ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
|
||||
SPA_FEATURE_LIVELIST));
|
||||
dsl_deadlist_open(&dd->dd_livelist, mos, obj);
|
||||
bplist_create(&dd->dd_pending_allocs);
|
||||
bplist_create(&dd->dd_pending_frees);
|
||||
}
|
||||
|
||||
void
|
||||
dsl_dir_livelist_close(dsl_dir_t *dd)
|
||||
{
|
||||
dsl_deadlist_close(&dd->dd_livelist);
|
||||
bplist_destroy(&dd->dd_pending_allocs);
|
||||
bplist_destroy(&dd->dd_pending_frees);
|
||||
}
|
||||
|
||||
void
|
||||
dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
|
||||
{
|
||||
uint64_t obj;
|
||||
dsl_pool_t *dp = dmu_tx_pool(tx);
|
||||
spa_t *spa = dp->dp_spa;
|
||||
livelist_condense_entry_t to_condense = spa->spa_to_condense;
|
||||
|
||||
if (!dsl_deadlist_is_open(&dd->dd_livelist))
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the livelist being removed is set to be condensed, stop the
|
||||
* condense zthr and indicate the cancellation in the spa_to_condense
|
||||
* struct in case the condense no-wait synctask has already started
|
||||
*/
|
||||
zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
|
||||
if (ll_condense_thread != NULL &&
|
||||
(to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
|
||||
/*
|
||||
* We use zthr_wait_cycle_done instead of zthr_cancel
|
||||
* because we don't want to destroy the zthr, just have
|
||||
* it skip its current task.
|
||||
*/
|
||||
spa->spa_to_condense.cancelled = B_TRUE;
|
||||
zthr_wait_cycle_done(ll_condense_thread);
|
||||
/*
|
||||
* If we've returned from zthr_wait_cycle_done without
|
||||
* clearing the to_condense data structure it's either
|
||||
* because the no-wait synctask has started (which is
|
||||
* indicated by 'syncing' field of to_condense) and we
|
||||
* can expect it to clear to_condense on its own.
|
||||
* Otherwise, we returned before the zthr ran. The
|
||||
* checkfunc will now fail as cancelled == B_TRUE so we
|
||||
* can safely NULL out ds, allowing a different dir's
|
||||
* livelist to be condensed.
|
||||
*
|
||||
* We can be sure that the to_condense struct will not
|
||||
* be repopulated at this stage because both this
|
||||
* function and dsl_livelist_try_condense execute in
|
||||
* syncing context.
|
||||
*/
|
||||
if ((spa->spa_to_condense.ds != NULL) &&
|
||||
!spa->spa_to_condense.syncing) {
|
||||
dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
|
||||
spa);
|
||||
spa->spa_to_condense.ds = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
dsl_dir_livelist_close(dd);
|
||||
int err = zap_lookup(dp->dp_meta_objset, dd->dd_object,
|
||||
DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj);
|
||||
if (err == 0) {
|
||||
VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
|
||||
DD_FIELD_LIVELIST, tx));
|
||||
if (total) {
|
||||
dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
|
||||
spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
|
||||
}
|
||||
} else {
|
||||
ASSERT3U(err, !=, ENOENT);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_KERNEL)
|
||||
EXPORT_SYMBOL(dsl_dir_set_quota);
|
||||
EXPORT_SYMBOL(dsl_dir_set_reservation);
|
||||
|
@ -721,7 +721,8 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
|
||||
* Now that the datasets have been completely synced, we can
|
||||
* clean up our in-memory structures accumulated while syncing:
|
||||
*
|
||||
* - move dead blocks from the pending deadlist to the on-disk deadlist
|
||||
* - move dead blocks from the pending deadlist and livelists
|
||||
* to the on-disk versions
|
||||
* - release hold from dsl_dataset_dirty()
|
||||
* - release key mapping hold from dsl_dataset_dirty()
|
||||
*/
|
||||
|
@ -3103,8 +3103,18 @@ dsl_scan_update_stats(dsl_scan_t *scn)
|
||||
}
|
||||
|
||||
static int
|
||||
dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(!bp_freed);
|
||||
return (dsl_scan_free_block_cb(arg, bp, tx));
|
||||
}
|
||||
|
||||
static int
|
||||
dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(!bp_freed);
|
||||
dsl_scan_t *scn = arg;
|
||||
const dva_t *dva = &bp->blk_dva[0];
|
||||
|
||||
@ -3123,6 +3133,7 @@ dsl_scan_active(dsl_scan_t *scn)
|
||||
{
|
||||
spa_t *spa = scn->scn_dp->dp_spa;
|
||||
uint64_t used = 0, comp, uncomp;
|
||||
boolean_t clones_left;
|
||||
|
||||
if (spa->spa_load_state != SPA_LOAD_NONE)
|
||||
return (B_FALSE);
|
||||
@ -3136,7 +3147,8 @@ dsl_scan_active(dsl_scan_t *scn)
|
||||
(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
|
||||
&used, &comp, &uncomp);
|
||||
}
|
||||
return (used != 0);
|
||||
clones_left = spa_livelist_delete_check(spa);
|
||||
return ((used != 0) || (clones_left));
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
@ -3233,7 +3245,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
scn->scn_zio_root = zio_root(spa, NULL,
|
||||
NULL, ZIO_FLAG_MUSTSUCCEED);
|
||||
err = bpobj_iterate(&dp->dp_free_bpobj,
|
||||
dsl_scan_free_block_cb, scn, tx);
|
||||
bpobj_dsl_scan_free_block_cb, scn, tx);
|
||||
VERIFY0(zio_wait(scn->scn_zio_root));
|
||||
scn->scn_zio_root = NULL;
|
||||
|
||||
@ -3330,7 +3342,8 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
-dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
|
||||
}
|
||||
|
||||
if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
|
||||
if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
|
||||
!spa_livelist_delete_check(spa)) {
|
||||
/* finished; verify that space accounting went to zero */
|
||||
ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
|
||||
ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
|
||||
|
499
module/zfs/spa.c
499
module/zfs/spa.c
@ -232,6 +232,27 @@ uint64_t zfs_max_missing_tvds_scan = 0;
|
||||
*/
|
||||
boolean_t zfs_pause_spa_sync = B_FALSE;
|
||||
|
||||
/*
|
||||
* Variables to indicate the livelist condense zthr func should wait at certain
|
||||
* points for the livelist to be removed - used to test condense/destroy races
|
||||
*/
|
||||
int zfs_livelist_condense_zthr_pause = 0;
|
||||
int zfs_livelist_condense_sync_pause = 0;
|
||||
|
||||
/*
|
||||
* Variables to track whether or not condense cancellation has been
|
||||
* triggered in testing.
|
||||
*/
|
||||
int zfs_livelist_condense_sync_cancel = 0;
|
||||
int zfs_livelist_condense_zthr_cancel = 0;
|
||||
|
||||
/*
|
||||
* Variable to track whether or not extra ALLOC blkptrs were added to a
|
||||
* livelist entry while it was being condensed (caused by the way we track
|
||||
* remapped blkptrs in dbuf_remap_impl)
|
||||
*/
|
||||
int zfs_livelist_condense_new_alloc = 0;
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
* SPA properties routines
|
||||
@ -1481,6 +1502,27 @@ spa_unload_log_sm_metadata(spa_t *spa)
|
||||
spa->spa_unflushed_stats.sus_blocklimit = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
spa_destroy_aux_threads(spa_t *spa)
|
||||
{
|
||||
if (spa->spa_condense_zthr != NULL) {
|
||||
zthr_destroy(spa->spa_condense_zthr);
|
||||
spa->spa_condense_zthr = NULL;
|
||||
}
|
||||
if (spa->spa_checkpoint_discard_zthr != NULL) {
|
||||
zthr_destroy(spa->spa_checkpoint_discard_zthr);
|
||||
spa->spa_checkpoint_discard_zthr = NULL;
|
||||
}
|
||||
if (spa->spa_livelist_delete_zthr != NULL) {
|
||||
zthr_destroy(spa->spa_livelist_delete_zthr);
|
||||
spa->spa_livelist_delete_zthr = NULL;
|
||||
}
|
||||
if (spa->spa_livelist_condense_zthr != NULL) {
|
||||
zthr_destroy(spa->spa_livelist_condense_zthr);
|
||||
spa->spa_livelist_condense_zthr = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Opposite of spa_load().
|
||||
*/
|
||||
@ -1552,15 +1594,7 @@ spa_unload(spa_t *spa)
|
||||
spa->spa_vdev_removal = NULL;
|
||||
}
|
||||
|
||||
if (spa->spa_condense_zthr != NULL) {
|
||||
zthr_destroy(spa->spa_condense_zthr);
|
||||
spa->spa_condense_zthr = NULL;
|
||||
}
|
||||
|
||||
if (spa->spa_checkpoint_discard_zthr != NULL) {
|
||||
zthr_destroy(spa->spa_checkpoint_discard_zthr);
|
||||
spa->spa_checkpoint_discard_zthr = NULL;
|
||||
}
|
||||
spa_destroy_aux_threads(spa);
|
||||
|
||||
spa_condense_fini(spa);
|
||||
|
||||
@ -2335,6 +2369,376 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
|
||||
return (SET_ERROR(err));
|
||||
}
|
||||
|
||||
boolean_t
|
||||
spa_livelist_delete_check(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_livelists_to_delete != 0);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
static boolean_t
|
||||
spa_livelist_delete_cb_check(void *arg, zthr_t *z)
|
||||
{
|
||||
spa_t *spa = arg;
|
||||
return (spa_livelist_delete_check(spa));
|
||||
}
|
||||
|
||||
static int
|
||||
delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
{
|
||||
spa_t *spa = arg;
|
||||
zio_free(spa, tx->tx_txg, bp);
|
||||
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
|
||||
-bp_get_dsize_sync(spa, bp),
|
||||
-BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
|
||||
{
|
||||
int err;
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t za;
|
||||
zap_cursor_init(&zc, os, zap_obj);
|
||||
err = zap_cursor_retrieve(&zc, &za);
|
||||
zap_cursor_fini(&zc);
|
||||
if (err == 0)
|
||||
*llp = za.za_first_integer;
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
* Components of livelist deletion that must be performed in syncing
|
||||
* context: freeing block pointers and updating the pool-wide data
|
||||
* structures to indicate how much work is left to do
|
||||
*/
|
||||
typedef struct sublist_delete_arg {
|
||||
spa_t *spa;
|
||||
dsl_deadlist_t *ll;
|
||||
uint64_t key;
|
||||
bplist_t *to_free;
|
||||
} sublist_delete_arg_t;
|
||||
|
||||
static void
|
||||
sublist_delete_sync(void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
sublist_delete_arg_t *sda = arg;
|
||||
spa_t *spa = sda->spa;
|
||||
dsl_deadlist_t *ll = sda->ll;
|
||||
uint64_t key = sda->key;
|
||||
bplist_t *to_free = sda->to_free;
|
||||
|
||||
bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
|
||||
dsl_deadlist_remove_entry(ll, key, tx);
|
||||
}
|
||||
|
||||
typedef struct livelist_delete_arg {
|
||||
spa_t *spa;
|
||||
uint64_t ll_obj;
|
||||
uint64_t zap_obj;
|
||||
} livelist_delete_arg_t;
|
||||
|
||||
static void
|
||||
livelist_delete_sync(void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
livelist_delete_arg_t *lda = arg;
|
||||
spa_t *spa = lda->spa;
|
||||
uint64_t ll_obj = lda->ll_obj;
|
||||
uint64_t zap_obj = lda->zap_obj;
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
uint64_t count;
|
||||
|
||||
/* free the livelist and decrement the feature count */
|
||||
VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
|
||||
dsl_deadlist_free(mos, ll_obj, tx);
|
||||
spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
|
||||
VERIFY0(zap_count(mos, zap_obj, &count));
|
||||
if (count == 0) {
|
||||
/* no more livelists to delete */
|
||||
VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_DELETED_CLONES, tx));
|
||||
VERIFY0(zap_destroy(mos, zap_obj, tx));
|
||||
spa->spa_livelists_to_delete = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Load in the value for the livelist to be removed and open it. Then,
|
||||
* load its first sublist and determine which block pointers should actually
|
||||
* be freed. Then, call a synctask which performs the actual frees and updates
|
||||
* the pool-wide livelist data.
|
||||
*/
|
||||
/* ARGSUSED */
|
||||
void
|
||||
spa_livelist_delete_cb(void *arg, zthr_t *z)
|
||||
{
|
||||
spa_t *spa = arg;
|
||||
uint64_t ll_obj = 0, count;
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
uint64_t zap_obj = spa->spa_livelists_to_delete;
|
||||
/*
|
||||
* Determine the next livelist to delete. This function should only
|
||||
* be called if there is at least one deleted clone.
|
||||
*/
|
||||
VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
|
||||
VERIFY0(zap_count(mos, ll_obj, &count));
|
||||
if (count > 0) {
|
||||
dsl_deadlist_t ll = { 0 };
|
||||
dsl_deadlist_entry_t *dle;
|
||||
bplist_t to_free;
|
||||
dsl_deadlist_open(&ll, mos, ll_obj);
|
||||
dle = dsl_deadlist_first(&ll);
|
||||
ASSERT3P(dle, !=, NULL);
|
||||
bplist_create(&to_free);
|
||||
int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
|
||||
z, NULL);
|
||||
if (err == 0) {
|
||||
sublist_delete_arg_t sync_arg = {
|
||||
.spa = spa,
|
||||
.ll = &ll,
|
||||
.key = dle->dle_mintxg,
|
||||
.to_free = &to_free
|
||||
};
|
||||
zfs_dbgmsg("deleting sublist (id %llu) from"
|
||||
" livelist %llu, %d remaining",
|
||||
dle->dle_bpobj.bpo_object, ll_obj, count - 1);
|
||||
VERIFY0(dsl_sync_task(spa_name(spa), NULL,
|
||||
sublist_delete_sync, &sync_arg, 0,
|
||||
ZFS_SPACE_CHECK_DESTROY));
|
||||
} else {
|
||||
ASSERT(err == EINTR);
|
||||
}
|
||||
bplist_clear(&to_free);
|
||||
bplist_destroy(&to_free);
|
||||
dsl_deadlist_close(&ll);
|
||||
} else {
|
||||
livelist_delete_arg_t sync_arg = {
|
||||
.spa = spa,
|
||||
.ll_obj = ll_obj,
|
||||
.zap_obj = zap_obj
|
||||
};
|
||||
zfs_dbgmsg("deletion of livelist %llu completed", ll_obj);
|
||||
VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
|
||||
&sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
spa_start_livelist_destroy_thread(spa_t *spa)
|
||||
{
|
||||
ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
|
||||
spa->spa_livelist_delete_zthr = zthr_create(
|
||||
spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa);
|
||||
}
|
||||
|
||||
typedef struct livelist_new_arg {
|
||||
bplist_t *allocs;
|
||||
bplist_t *frees;
|
||||
} livelist_new_arg_t;
|
||||
|
||||
static int
|
||||
livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(tx == NULL);
|
||||
livelist_new_arg_t *lna = arg;
|
||||
if (bp_freed) {
|
||||
bplist_append(lna->frees, bp);
|
||||
} else {
|
||||
bplist_append(lna->allocs, bp);
|
||||
zfs_livelist_condense_new_alloc++;
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
typedef struct livelist_condense_arg {
|
||||
spa_t *spa;
|
||||
bplist_t to_keep;
|
||||
uint64_t first_size;
|
||||
uint64_t next_size;
|
||||
} livelist_condense_arg_t;
|
||||
|
||||
static void
|
||||
spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
livelist_condense_arg_t *lca = arg;
|
||||
spa_t *spa = lca->spa;
|
||||
bplist_t new_frees;
|
||||
dsl_dataset_t *ds = spa->spa_to_condense.ds;
|
||||
|
||||
/* Have we been cancelled? */
|
||||
if (spa->spa_to_condense.cancelled) {
|
||||
zfs_livelist_condense_sync_cancel++;
|
||||
goto out;
|
||||
}
|
||||
|
||||
dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
|
||||
dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
|
||||
dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
|
||||
|
||||
/*
|
||||
* It's possible that the livelist was changed while the zthr was
|
||||
* running. Therefore, we need to check for new blkptrs in the two
|
||||
* entries being condensed and continue to track them in the livelist.
|
||||
* Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
|
||||
* it's possible that the newly added blkptrs are FREEs or ALLOCs so
|
||||
* we need to sort them into two different bplists.
|
||||
*/
|
||||
uint64_t first_obj = first->dle_bpobj.bpo_object;
|
||||
uint64_t next_obj = next->dle_bpobj.bpo_object;
|
||||
uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
|
||||
uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
|
||||
|
||||
bplist_create(&new_frees);
|
||||
livelist_new_arg_t new_bps = {
|
||||
.allocs = &lca->to_keep,
|
||||
.frees = &new_frees,
|
||||
};
|
||||
|
||||
if (cur_first_size > lca->first_size) {
|
||||
VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
|
||||
livelist_track_new_cb, &new_bps, lca->first_size));
|
||||
}
|
||||
if (cur_next_size > lca->next_size) {
|
||||
VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
|
||||
livelist_track_new_cb, &new_bps, lca->next_size));
|
||||
}
|
||||
|
||||
dsl_deadlist_clear_entry(first, ll, tx);
|
||||
ASSERT(bpobj_is_empty(&first->dle_bpobj));
|
||||
dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
|
||||
|
||||
bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
|
||||
bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
|
||||
bplist_destroy(&new_frees);
|
||||
|
||||
char dsname[ZFS_MAX_DATASET_NAME_LEN];
|
||||
dsl_dataset_name(ds, dsname);
|
||||
zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
|
||||
"(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
|
||||
"(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj,
|
||||
cur_first_size, next_obj, cur_next_size,
|
||||
first->dle_bpobj.bpo_object,
|
||||
first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
|
||||
out:
|
||||
dmu_buf_rele(ds->ds_dbuf, spa);
|
||||
spa->spa_to_condense.ds = NULL;
|
||||
bplist_clear(&lca->to_keep);
|
||||
bplist_destroy(&lca->to_keep);
|
||||
kmem_free(lca, sizeof (livelist_condense_arg_t));
|
||||
spa->spa_to_condense.syncing = B_FALSE;
|
||||
}
|
||||
|
||||
void
|
||||
spa_livelist_condense_cb(void *arg, zthr_t *t)
|
||||
{
|
||||
while (zfs_livelist_condense_zthr_pause &&
|
||||
!(zthr_has_waiters(t) || zthr_iscancelled(t)))
|
||||
delay(1);
|
||||
|
||||
spa_t *spa = arg;
|
||||
dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
|
||||
dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
|
||||
uint64_t first_size, next_size;
|
||||
|
||||
livelist_condense_arg_t *lca =
|
||||
kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
|
||||
bplist_create(&lca->to_keep);
|
||||
|
||||
/*
|
||||
* Process the livelists (matching FREEs and ALLOCs) in open context
|
||||
* so we have minimal work in syncing context to condense.
|
||||
*
|
||||
* We save bpobj sizes (first_size and next_size) to use later in
|
||||
* syncing context to determine if entries were added to these sublists
|
||||
* while in open context. This is possible because the clone is still
|
||||
* active and open for normal writes and we want to make sure the new,
|
||||
* unprocessed blockpointers are inserted into the livelist normally.
|
||||
*
|
||||
* Note that dsl_process_sub_livelist() both stores the size number of
|
||||
* blockpointers and iterates over them while the bpobj's lock held, so
|
||||
* the sizes returned to us are consistent which what was actually
|
||||
* processed.
|
||||
*/
|
||||
int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
|
||||
&first_size);
|
||||
if (err == 0)
|
||||
err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
|
||||
t, &next_size);
|
||||
|
||||
if (err == 0) {
|
||||
while (zfs_livelist_condense_sync_pause &&
|
||||
!(zthr_has_waiters(t) || zthr_iscancelled(t)))
|
||||
delay(1);
|
||||
|
||||
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
|
||||
dmu_tx_mark_netfree(tx);
|
||||
dmu_tx_hold_space(tx, 1);
|
||||
err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
|
||||
if (err == 0) {
|
||||
/*
|
||||
* Prevent the condense zthr restarting before
|
||||
* the synctask completes.
|
||||
*/
|
||||
spa->spa_to_condense.syncing = B_TRUE;
|
||||
lca->spa = spa;
|
||||
lca->first_size = first_size;
|
||||
lca->next_size = next_size;
|
||||
dsl_sync_task_nowait(spa_get_dsl(spa),
|
||||
spa_livelist_condense_sync, lca, 0,
|
||||
ZFS_SPACE_CHECK_NONE, tx);
|
||||
dmu_tx_commit(tx);
|
||||
return;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Condensing can not continue: either it was externally stopped or
|
||||
* we were unable to assign to a tx because the pool has run out of
|
||||
* space. In the second case, we'll just end up trying to condense
|
||||
* again in a later txg.
|
||||
*/
|
||||
ASSERT(err != 0);
|
||||
bplist_clear(&lca->to_keep);
|
||||
bplist_destroy(&lca->to_keep);
|
||||
kmem_free(lca, sizeof (livelist_condense_arg_t));
|
||||
dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
|
||||
spa->spa_to_condense.ds = NULL;
|
||||
if (err == EINTR)
|
||||
zfs_livelist_condense_zthr_cancel++;
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
/*
|
||||
* Check that there is something to condense but that a condense is not
|
||||
* already in progress and that condensing has not been cancelled.
|
||||
*/
|
||||
static boolean_t
|
||||
spa_livelist_condense_cb_check(void *arg, zthr_t *z)
|
||||
{
|
||||
spa_t *spa = arg;
|
||||
if ((spa->spa_to_condense.ds != NULL) &&
|
||||
(spa->spa_to_condense.syncing == B_FALSE) &&
|
||||
(spa->spa_to_condense.cancelled == B_FALSE)) {
|
||||
return (B_TRUE);
|
||||
}
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
void
|
||||
spa_start_livelist_condensing_thread(spa_t *spa)
|
||||
{
|
||||
spa->spa_to_condense.ds = NULL;
|
||||
spa->spa_to_condense.first = NULL;
|
||||
spa->spa_to_condense.next = NULL;
|
||||
spa->spa_to_condense.syncing = B_FALSE;
|
||||
spa->spa_to_condense.cancelled = B_FALSE;
|
||||
|
||||
ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
|
||||
spa->spa_livelist_condense_zthr = zthr_create(
|
||||
spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa);
|
||||
}
|
||||
|
||||
static void
|
||||
spa_spawn_aux_threads(spa_t *spa)
|
||||
{
|
||||
@ -2343,6 +2747,8 @@ spa_spawn_aux_threads(spa_t *spa)
|
||||
ASSERT(MUTEX_HELD(&spa_namespace_lock));
|
||||
|
||||
spa_start_indirect_condensing_thread(spa);
|
||||
spa_start_livelist_destroy_thread(spa);
|
||||
spa_start_livelist_condensing_thread(spa);
|
||||
|
||||
ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
|
||||
spa->spa_checkpoint_discard_zthr =
|
||||
@ -3603,6 +4009,15 @@ spa_ld_get_props(spa_t *spa)
|
||||
if (error != 0 && error != ENOENT)
|
||||
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
|
||||
|
||||
/*
|
||||
* Load the livelist deletion field. If a livelist is queued for
|
||||
* deletion, indicate that in the spa
|
||||
*/
|
||||
error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
|
||||
&spa->spa_livelists_to_delete, B_FALSE);
|
||||
if (error != 0 && error != ENOENT)
|
||||
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
|
||||
|
||||
/*
|
||||
* Load the history object. If we have an older pool, this
|
||||
* will not be present.
|
||||
@ -7571,6 +7986,14 @@ spa_async_suspend(spa_t *spa)
|
||||
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
|
||||
if (discard_thread != NULL)
|
||||
zthr_cancel(discard_thread);
|
||||
|
||||
zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
|
||||
if (ll_delete_thread != NULL)
|
||||
zthr_cancel(ll_delete_thread);
|
||||
|
||||
zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
|
||||
if (ll_condense_thread != NULL)
|
||||
zthr_cancel(ll_condense_thread);
|
||||
}
|
||||
|
||||
void
|
||||
@ -7589,6 +8012,14 @@ spa_async_resume(spa_t *spa)
|
||||
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
|
||||
if (discard_thread != NULL)
|
||||
zthr_resume(discard_thread);
|
||||
|
||||
zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
|
||||
if (ll_delete_thread != NULL)
|
||||
zthr_resume(ll_delete_thread);
|
||||
|
||||
zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
|
||||
if (ll_condense_thread != NULL)
|
||||
zthr_resume(ll_condense_thread);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
@ -7639,14 +8070,28 @@ spa_async_request(spa_t *spa, int task)
|
||||
* ==========================================================================
|
||||
*/
|
||||
|
||||
|
||||
static int
|
||||
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
bpobj_t *bpo = arg;
|
||||
bpobj_enqueue(bpo, bp, tx);
|
||||
bpobj_enqueue(bpo, bp, bp_freed, tx);
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
{
|
||||
return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
|
||||
}
|
||||
|
||||
int
|
||||
bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
{
|
||||
return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
|
||||
}
|
||||
|
||||
static int
|
||||
spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
{
|
||||
@ -7657,6 +8102,14 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(!bp_freed);
|
||||
return (spa_free_sync_cb(arg, bp, tx));
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: this simple function is not inlined to make it easier to dtrace the
|
||||
* amount of time spent syncing frees.
|
||||
@ -7693,7 +8146,7 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
|
||||
*/
|
||||
zio_t *zio = zio_root(spa, NULL, NULL, 0);
|
||||
VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
|
||||
spa_free_sync_cb, zio, tx), ==, 0);
|
||||
bpobj_spa_free_sync_cb, zio, tx), ==, 0);
|
||||
VERIFY0(zio_wait(zio));
|
||||
}
|
||||
|
||||
@ -8296,7 +8749,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
|
||||
* we sync the deferred frees later in pass 1.
|
||||
*/
|
||||
ASSERT3U(pass, >, 1);
|
||||
bplist_iterate(free_bpl, bpobj_enqueue_cb,
|
||||
bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
|
||||
&spa->spa_deferred_bpobj, tx);
|
||||
}
|
||||
|
||||
@ -8884,4 +9337,24 @@ MODULE_PARM_DESC(zfs_max_missing_tvds,
|
||||
" (in read-only mode)");
|
||||
/* END CSTYLED */
|
||||
|
||||
module_param(zfs_livelist_condense_zthr_pause, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_livelist_condense_zthr_pause,
|
||||
"Set the livelist condense zthr to pause");
|
||||
module_param(zfs_livelist_condense_sync_pause, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_livelist_condense_sync_pause,
|
||||
"Set the livelist condense synctask to pause");
|
||||
|
||||
module_param(zfs_livelist_condense_sync_cancel, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_livelist_condense_sync_cancel,
|
||||
"Whether livelist condensing was canceled in the synctask");
|
||||
module_param(zfs_livelist_condense_zthr_cancel, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_livelist_condense_zthr_cancel,
|
||||
"Whether livelist condensing was canceled in the zthr function");
|
||||
|
||||
/* BEGIN CSTYLED */
|
||||
module_param(zfs_livelist_condense_new_alloc, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_livelist_condense_new_alloc,
|
||||
"Whether extra ALLOC blkptrs were added to a livelist entry while it"
|
||||
" was being condensed");
|
||||
/* END CSTYLED */
|
||||
#endif
|
||||
|
@ -21,7 +21,7 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2014 Integros [integros.com]
|
||||
* Copyright 2017 Joyent, Inc.
|
||||
*/
|
||||
@ -413,7 +413,6 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
|
||||
|
||||
/* spa_history_log_sync will free nvl */
|
||||
return (err);
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -207,12 +207,15 @@ struct zthr {
|
||||
/* flag set to true if we are canceling the zthr */
|
||||
boolean_t zthr_cancel;
|
||||
|
||||
/* flag set to true if we are waiting for the zthr to finish */
|
||||
boolean_t zthr_haswaiters;
|
||||
kcondvar_t zthr_wait_cv;
|
||||
/*
|
||||
* maximum amount of time that the zthr is spent sleeping;
|
||||
* if this is 0, the thread doesn't wake up until it gets
|
||||
* signaled.
|
||||
*/
|
||||
hrtime_t zthr_wait_time;
|
||||
hrtime_t zthr_sleep_timeout;
|
||||
|
||||
/* consumer-provided callbacks & data */
|
||||
zthr_checkfunc_t *zthr_checkfunc;
|
||||
@ -239,14 +242,18 @@ zthr_procedure(void *arg)
|
||||
* order to prevent this process from incorrectly
|
||||
* contributing to the system load average when idle.
|
||||
*/
|
||||
if (t->zthr_wait_time == 0) {
|
||||
if (t->zthr_sleep_timeout == 0) {
|
||||
cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock);
|
||||
} else {
|
||||
(void) cv_timedwait_sig_hires(&t->zthr_cv,
|
||||
&t->zthr_state_lock, t->zthr_wait_time,
|
||||
&t->zthr_state_lock, t->zthr_sleep_timeout,
|
||||
MSEC2NSEC(1), 0);
|
||||
}
|
||||
}
|
||||
if (t->zthr_haswaiters) {
|
||||
t->zthr_haswaiters = B_FALSE;
|
||||
cv_broadcast(&t->zthr_wait_cv);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -280,12 +287,13 @@ zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
|
||||
mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
|
||||
cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
mutex_enter(&t->zthr_state_lock);
|
||||
t->zthr_checkfunc = checkfunc;
|
||||
t->zthr_func = func;
|
||||
t->zthr_arg = arg;
|
||||
t->zthr_wait_time = max_sleep;
|
||||
t->zthr_sleep_timeout = max_sleep;
|
||||
|
||||
t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
|
||||
0, &p0, TS_RUN, minclsyspri);
|
||||
@ -303,6 +311,7 @@ zthr_destroy(zthr_t *t)
|
||||
mutex_destroy(&t->zthr_request_lock);
|
||||
mutex_destroy(&t->zthr_state_lock);
|
||||
cv_destroy(&t->zthr_cv);
|
||||
cv_destroy(&t->zthr_wait_cv);
|
||||
kmem_free(t, sizeof (*t));
|
||||
}
|
||||
|
||||
@ -355,9 +364,8 @@ zthr_cancel(zthr_t *t)
|
||||
*
|
||||
* [1] The thread has already been cancelled, therefore
|
||||
* there is nothing for us to do.
|
||||
* [2] The thread is sleeping, so we broadcast the CV first
|
||||
* to wake it up and then we set the flag and we are
|
||||
* waiting for it to exit.
|
||||
* [2] The thread is sleeping so we set the flag, broadcast
|
||||
* the CV and wait for it to exit.
|
||||
* [3] The thread is doing work, in which case we just set
|
||||
* the flag and wait for it to finish.
|
||||
* [4] The thread was just created/resumed, in which case
|
||||
@ -397,6 +405,7 @@ zthr_resume(zthr_t *t)
|
||||
ASSERT3P(&t->zthr_checkfunc, !=, NULL);
|
||||
ASSERT3P(&t->zthr_func, !=, NULL);
|
||||
ASSERT(!t->zthr_cancel);
|
||||
ASSERT(!t->zthr_haswaiters);
|
||||
|
||||
/*
|
||||
* There are 4 states that we find the zthr in at this point
|
||||
@ -451,3 +460,74 @@ zthr_iscancelled(zthr_t *t)
|
||||
mutex_exit(&t->zthr_state_lock);
|
||||
return (cancelled);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for the zthr to finish its current function. Similar to
|
||||
* zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end
|
||||
* early. Unlike zthr_cancel, the thread is not destroyed. If the zthr was
|
||||
* sleeping or cancelled, return immediately.
|
||||
*/
|
||||
void
|
||||
zthr_wait_cycle_done(zthr_t *t)
|
||||
{
|
||||
mutex_enter(&t->zthr_state_lock);
|
||||
|
||||
/*
|
||||
* Since we are holding the zthr_state_lock at this point
|
||||
* we can find the state in one of the following 5 states:
|
||||
*
|
||||
* [1] The thread has already cancelled, therefore
|
||||
* there is nothing for us to do.
|
||||
* [2] The thread is sleeping so we set the flag, broadcast
|
||||
* the CV and wait for it to exit.
|
||||
* [3] The thread is doing work, in which case we just set
|
||||
* the flag and wait for it to finish.
|
||||
* [4] The thread was just created/resumed, in which case
|
||||
* the behavior is similar to [3].
|
||||
* [5] The thread is the middle of being cancelled, which is
|
||||
* similar to [3]. We'll wait for the cancel, which is
|
||||
* waiting for the zthr func.
|
||||
*
|
||||
* Since requests are serialized, by the time that we get
|
||||
* control back we expect that the zthr has completed it's
|
||||
* zthr_func.
|
||||
*/
|
||||
if (t->zthr_thread != NULL) {
|
||||
t->zthr_haswaiters = B_TRUE;
|
||||
|
||||
/* broadcast in case the zthr is sleeping */
|
||||
cv_broadcast(&t->zthr_cv);
|
||||
|
||||
while ((t->zthr_haswaiters) && (t->zthr_thread != NULL))
|
||||
cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock);
|
||||
|
||||
ASSERT(!t->zthr_haswaiters);
|
||||
}
|
||||
|
||||
mutex_exit(&t->zthr_state_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is intended to be used by the zthr itself
|
||||
* to check if another thread is waiting on it to finish
|
||||
*
|
||||
* returns TRUE if we have been asked to finish.
|
||||
*
|
||||
* returns FALSE otherwise.
|
||||
*/
|
||||
boolean_t
|
||||
zthr_has_waiters(zthr_t *t)
|
||||
{
|
||||
ASSERT3P(t->zthr_thread, ==, curthread);
|
||||
|
||||
mutex_enter(&t->zthr_state_lock);
|
||||
|
||||
/*
|
||||
* Similarly to zthr_iscancelled(), we only grab the
|
||||
* zthr_state_lock so that the zthr itself can use this
|
||||
* to check for the request.
|
||||
*/
|
||||
boolean_t has_waiters = t->zthr_haswaiters;
|
||||
mutex_exit(&t->zthr_state_lock);
|
||||
return (has_waiters);
|
||||
}
|
||||
|
@ -147,12 +147,15 @@ tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos',
|
||||
tags = ['functional', 'cli_root', 'zfs_create']
|
||||
|
||||
[tests/functional/cli_root/zfs_destroy]
|
||||
tests = ['zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
|
||||
tests = ['zfs_clone_livelist_condense_and_disable',
|
||||
'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos',
|
||||
'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
|
||||
'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
|
||||
'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
|
||||
'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
|
||||
'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos',
|
||||
'zfs_destroy_016_pos']
|
||||
'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist',
|
||||
'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense']
|
||||
tags = ['functional', 'cli_root', 'zfs_destroy']
|
||||
|
||||
[tests/functional/cli_root/zfs_diff]
|
||||
|
@ -22,7 +22,7 @@
|
||||
#
|
||||
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
# Use is subject to license terms.
|
||||
# Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
||||
# Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
# Copyright (c) 2017 by Tim Chase. All rights reserved.
|
||||
# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2017 Lawrence Livermore National Security, LLC.
|
||||
|
@ -2,6 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_destro
|
||||
dist_pkgdata_SCRIPTS = \
|
||||
setup.ksh \
|
||||
cleanup.ksh \
|
||||
zfs_clone_livelist_condense_and_disable.ksh \
|
||||
zfs_clone_livelist_condense_races.ksh \
|
||||
zfs_destroy_001_pos.ksh \
|
||||
zfs_destroy_002_pos.ksh \
|
||||
zfs_destroy_003_pos.ksh \
|
||||
@ -17,7 +19,10 @@ dist_pkgdata_SCRIPTS = \
|
||||
zfs_destroy_013_neg.ksh \
|
||||
zfs_destroy_014_pos.ksh \
|
||||
zfs_destroy_015_pos.ksh \
|
||||
zfs_destroy_016_pos.ksh
|
||||
zfs_destroy_016_pos.ksh \
|
||||
zfs_destroy_clone_livelist.ksh \
|
||||
zfs_destroy_dev_removal.ksh \
|
||||
zfs_destroy_dev_removal_condense.ksh
|
||||
|
||||
dist_pkgdata_DATA = \
|
||||
zfs_destroy_common.kshlib \
|
||||
|
@ -0,0 +1,125 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2018 by Delphix. All rights reserved.
|
||||
#
|
||||
|
||||
# DESCRIPTION
|
||||
# Verify zfs destroy test for clones with the livelist feature
|
||||
# enabled.
|
||||
|
||||
# STRATEGY
|
||||
# 1. Clone where livelist is condensed
|
||||
# - create clone, write several files, delete those files
|
||||
# - check that the number of livelist entries decreases
|
||||
# after the delete
|
||||
# 2. Clone where livelist is deactivated
|
||||
# - create clone, write files. Delete those files and the
|
||||
# file in the filesystem when the snapshot was created
|
||||
# so the clone and snapshot no longer share data
|
||||
# - check that the livelist is destroyed
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
|
||||
|
||||
function cleanup
|
||||
{
|
||||
log_must zfs destroy -Rf $TESTPOOL/$TESTFS1
|
||||
# reset the livelist sublist size to the original value
|
||||
set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX
|
||||
# reset the minimum percent shared to its original value
|
||||
set_tunable32 zfs_livelist_min_percent_shared $ORIGINAL_MIN
|
||||
}
|
||||
|
||||
function check_ll_len
|
||||
{
|
||||
string="$(zdb -vvvvv $TESTPOOL | grep "Livelist")"
|
||||
substring="$1"
|
||||
msg=$2
|
||||
if test "${string#*$substring}" != "$string"; then
|
||||
return 0 # $substring is in $string
|
||||
else
|
||||
log_note $string
|
||||
log_fail "$msg" # $substring is not in $string
|
||||
fi
|
||||
}
|
||||
|
||||
function test_condense
|
||||
{
|
||||
# set the max livelist entries to a small value to more easily
|
||||
# trigger a condense
|
||||
set_tunable64 zfs_livelist_max_entries 0x14
|
||||
# set a small percent shared threshold so the livelist is not disabled
|
||||
set_tunable32 zfs_livelist_min_percent_shared 0xa
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE
|
||||
|
||||
# sync between each write to make sure a new entry is created
|
||||
for i in {0..4}; do
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i
|
||||
log_must zpool sync $TESTPOOL
|
||||
done
|
||||
|
||||
check_ll_len "5 entries" "Unexpected livelist size"
|
||||
|
||||
# sync between each write to allow for a condense of the previous entry
|
||||
for i in {0..4}; do
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i
|
||||
log_must zpool sync $TESTPOOL
|
||||
done
|
||||
|
||||
check_ll_len "6 entries" "Condense did not occur"
|
||||
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE
|
||||
check_livelist_gone
|
||||
}
|
||||
|
||||
function test_deactivated
|
||||
{
|
||||
# Threshold set to 50 percent
|
||||
set_tunable32 zfs_livelist_min_percent_shared 0x32
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE
|
||||
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1
|
||||
log_must zpool sync $TESTPOOL
|
||||
# snapshot and clone share 'atestfile', 33 percent
|
||||
check_livelist_gone
|
||||
log_must zfs destroy -R $TESTPOOL/$TESTCLONE
|
||||
|
||||
# Threshold set to 20 percent
|
||||
set_tunable32 zfs_livelist_min_percent_shared 0x14
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE
|
||||
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE2
|
||||
log_must zpool sync $TESTPOOL
|
||||
# snapshot and clone share 'atestfile', 25 percent
|
||||
check_livelist_exists $TESTCLONE
|
||||
log_must rm /$TESTPOOL/$TESTCLONE/atestfile
|
||||
# snapshot and clone share no files
|
||||
check_livelist_gone
|
||||
log_must zfs destroy -R $TESTPOOL/$TESTCLONE
|
||||
}
|
||||
|
||||
ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries)
|
||||
ORIGINAL_MIN=$(get_tunable zfs_livelist_min_percent_shared)
|
||||
|
||||
log_onexit cleanup
|
||||
log_must zfs create $TESTPOOL/$TESTFS1
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile
|
||||
log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
|
||||
test_condense
|
||||
test_deactivated
|
||||
|
||||
log_pass "Clone's livelist condenses and disables as expected."
|
@ -0,0 +1,116 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2018 by Delphix. All rights reserved.
|
||||
#
|
||||
|
||||
# DESCRIPTION
|
||||
# Test race conditions for livelist condensing
|
||||
|
||||
# STRATEGY
|
||||
# These tests exercise code paths that deal with a livelist being
|
||||
# simultaneously condensed and deactivated (deleted, exported or disabled).
|
||||
# If a variable is set, the zthr will pause until it is cancelled or waited
|
||||
# and then a counter variable keeps track of whether or not the code path is
|
||||
# reached.
|
||||
|
||||
# 1. Deletion race: repeatedly overwrite the same file to trigger condense
|
||||
# and then delete the clone.
|
||||
# 2. Disable race: Overwrite enough files to trigger condenses and disabling of
|
||||
# the livelist.
|
||||
# 3. Export race: repeatedly overwrite the same file to trigger condense and
|
||||
# then export the pool.
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
function cleanup
|
||||
{
|
||||
log_must zfs destroy -Rf $TESTPOOL/$TESTFS1
|
||||
# reset the livelist sublist size to the original value
|
||||
set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX
|
||||
# reset the condense tests to 0
|
||||
set_tunable32 zfs_livelist_condense_zthr_pause 0
|
||||
set_tunable32 zfs_livelist_condense_sync_pause 0
|
||||
}
|
||||
|
||||
function delete_race
|
||||
{
|
||||
set_tunable32 "$1" 0
|
||||
log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE
|
||||
for i in {1..5}; do
|
||||
log_must zpool sync $TESTPOOL
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out
|
||||
done
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE
|
||||
log_must zpool sync $TESTPOOL
|
||||
[[ "1" == "$(get_tunable "$1")" ]] || \
|
||||
log_fail "delete/condense race test failed"
|
||||
}
|
||||
|
||||
function export_race
|
||||
{
|
||||
set_tunable32 "$1" 0
|
||||
log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE
|
||||
for i in {1..5}; do
|
||||
log_must zpool sync $TESTPOOL
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out
|
||||
done
|
||||
log_must zpool export $TESTPOOL
|
||||
log_must zpool import $TESTPOOL
|
||||
[[ "1" == "$(get_tunable "$1")" ]] || \
|
||||
log_fail "export/condense race test failed"
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE
|
||||
}
|
||||
|
||||
function disable_race
|
||||
{
|
||||
set_tunable32 "$1" 0
|
||||
log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE
|
||||
for i in {1..5}; do
|
||||
log_must zpool sync $TESTPOOL
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out
|
||||
done
|
||||
# overwrite the file shared with the origin to trigger disable
|
||||
log_must mkfile 100m /$TESTPOOL/$TESTCLONE/atestfile
|
||||
log_must zpool sync $TESTPOOL
|
||||
[[ "1" == "$(get_tunable "$1")" ]] || \
|
||||
log_fail "disable/condense race test failed"
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE
|
||||
}
|
||||
|
||||
ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries)
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
log_must zfs create $TESTPOOL/$TESTFS1
|
||||
log_must mkfile 100m /$TESTPOOL/$TESTFS1/atestfile
|
||||
log_must zpool sync $TESTPOOL
|
||||
log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
|
||||
|
||||
# Reduce livelist size to trigger condense more easily
|
||||
set_tunable64 zfs_livelist_max_entries 0x14
|
||||
|
||||
# Test cancellation path in the zthr
|
||||
set_tunable32 zfs_livelist_condense_zthr_pause 1
|
||||
set_tunable32 zfs_livelist_condense_sync_pause 0
|
||||
disable_race "zfs_livelist_condense_zthr_cancel"
|
||||
delete_race "zfs_livelist_condense_zthr_cancel"
|
||||
export_race "zfs_livelist_condense_zthr_cancel"
|
||||
|
||||
# Test cancellation path in the synctask
|
||||
set_tunable32 zfs_livelist_condense_zthr_pause 0
|
||||
set_tunable32 zfs_livelist_condense_sync_pause 1
|
||||
disable_race "zfs_livelist_condense_sync_cancel"
|
||||
delete_race "zfs_livelist_condense_sync_cancel"
|
||||
|
||||
log_pass "Clone livelist condense race conditions passed."
|
@ -0,0 +1,140 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2018 by Delphix. All rights reserved.
|
||||
#
|
||||
|
||||
# DESCRIPTION
|
||||
# Verify zfs destroy test for clones with the livelist feature
|
||||
# enabled.
|
||||
|
||||
# STRATEGY
|
||||
# 1. One clone with an empty livelist
|
||||
# - create the clone, check that livelist exists
|
||||
# - delete the clone, check that livelist is eventually
|
||||
# destroyed
|
||||
# 2. One clone with populated livelist
|
||||
# - create the clone, check that livelist exists
|
||||
# - write multiple files to the clone
|
||||
# - delete the clone, check that livelist is eventually
|
||||
# destroyed
|
||||
# 3. Multiple clones with empty livelists
|
||||
# - same as 1. but with multiple clones
|
||||
# 4. Multiple clones with populated livelists
|
||||
# - same as 2. but with multiple clones
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
|
||||
|
||||
function cleanup
|
||||
{
|
||||
datasetexists $TESTPOOL/$TESTFS1 && zfs destroy -R $TESTPOOL/$TESTFS1
|
||||
# reset the livelist sublist size to its original value
|
||||
set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX
|
||||
}
|
||||
|
||||
function clone_write_file
|
||||
{
|
||||
log_must mkfile 1m /$TESTPOOL/$1/$2
|
||||
log_must zpool sync $TESTPOOL
|
||||
}
|
||||
|
||||
function test_one_empty
|
||||
{
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE
|
||||
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE
|
||||
check_livelist_gone
|
||||
}
|
||||
|
||||
function test_one
|
||||
{
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE
|
||||
|
||||
clone_write_file $TESTCLONE $TESTFILE0
|
||||
clone_write_file $TESTCLONE $TESTFILE1
|
||||
clone_write_file $TESTCLONE $TESTFILE2
|
||||
log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE0
|
||||
log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE2
|
||||
check_livelist_exists $TESTCLONE
|
||||
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE
|
||||
check_livelist_gone
|
||||
}
|
||||
|
||||
function test_multiple_empty
|
||||
{
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE1
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE2
|
||||
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE1
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE2
|
||||
check_livelist_gone
|
||||
}
|
||||
|
||||
function test_multiple
|
||||
{
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE1
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE2
|
||||
|
||||
clone_write_file $TESTCLONE $TESTFILE0
|
||||
|
||||
clone_write_file $TESTCLONE1 $TESTFILE0
|
||||
clone_write_file $TESTCLONE1 $TESTFILE1
|
||||
clone_write_file $TESTCLONE1 $TESTFILE2
|
||||
|
||||
clone_write_file $TESTCLONE2 $TESTFILE0
|
||||
log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE0
|
||||
clone_write_file $TESTCLONE2 $TESTFILE1
|
||||
log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE1
|
||||
|
||||
check_livelist_exists $TESTCLONE
|
||||
check_livelist_exists $TESTCLONE1
|
||||
check_livelist_exists $TESTCLONE2
|
||||
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE1
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE2
|
||||
check_livelist_gone
|
||||
}
|
||||
|
||||
function test_promote
|
||||
{
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE
|
||||
|
||||
log_must zfs promote $TESTPOOL/$TESTCLONE
|
||||
check_livelist_gone
|
||||
log_must zfs destroy -R $TESTPOOL/$TESTCLONE
|
||||
}
|
||||
|
||||
ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries)
|
||||
|
||||
log_onexit cleanup
|
||||
log_must zfs create $TESTPOOL/$TESTFS1
|
||||
log_must mkfile 20m /$TESTPOOL/$TESTFS1/atestfile
|
||||
log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
|
||||
|
||||
# set a small livelist entry size to more easily test multiple entry livelists
|
||||
set_tunable64 zfs_livelist_max_entries 0x14
|
||||
|
||||
test_one_empty
|
||||
test_one
|
||||
test_multiple_empty
|
||||
test_multiple
|
||||
test_promote
|
||||
|
||||
log_pass "Clone with the livelist feature enabled could be destroyed," \
|
||||
"also could be promoted and destroyed as expected."
|
@ -25,7 +25,7 @@
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
||||
# Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
@ -146,3 +146,43 @@ function check_dataset
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
# Use zdb to see if a livelist exists for a given clone
|
||||
# $1 clone name
|
||||
function check_livelist_exists
|
||||
{
|
||||
zdb -vvvvv $TESTPOOL/$1 | grep "Livelist" || \
|
||||
log_fail "zdb could not find Livelist"
|
||||
}
|
||||
|
||||
# Wait for the deferred destroy livelists to be removed
|
||||
function wait_for_deferred_destroy
{
	# Flush outstanding writes before inspecting the pool with zdb.
	sync
	# Poll until zdb no longer reports any "Deleted Livelist" entry.
	# grep -q keeps the output quiet and its exit status drives the
	# loop; the short sleep avoids re-running zdb back to back.
	while zdb -vvvvv $TESTPOOL | grep -q "Deleted Livelist"; do
		sleep 1
	done
}
|
||||
|
||||
# Check that a livelist has been removed, waiting for deferred destroy entries
|
||||
# to be cleared from zdb.
|
||||
function check_livelist_gone
|
||||
{
|
||||
wait_for_deferred_destroy
|
||||
zdb -vvvvv $TESTPOOL | grep "Livelist" && \
|
||||
log_fail "zdb found Livelist after the clone is deleted."
|
||||
}
|
||||
|
||||
# Create a clone in the testpool based on $TESTFS@snap. Verify that the clone
|
||||
# was created and that it includes a livelist
|
||||
# $1 fs name
|
||||
# $2 snap name
|
||||
# $3 clone name
|
||||
function clone_dataset
|
||||
{
|
||||
log_must zfs clone $TESTPOOL/$1@$2 $TESTPOOL/$3
|
||||
datasetexists $TESTPOOL/$3 || \
|
||||
log_fail "zfs clone $TESTPOOL/$3 fail."
|
||||
check_livelist_exists $3
|
||||
}
|
||||
|
@ -0,0 +1,68 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2018 by Delphix. All rights reserved.
|
||||
#
|
||||
|
||||
# DESCRIPTION
|
||||
# Verify that livelists tracking remapped blocks can be
|
||||
# properly destroyed.
|
||||
|
||||
# STRATEGY
|
||||
# 1. Create a pool with disk1 and create a filesystem, snapshot
|
||||
# and clone. Write several files to the clone.
|
||||
# 2. Add disk2 to the pool and then remove disk1, triggering a
|
||||
# remap of the blkptrs tracked in the livelist.
|
||||
# 3. Delete the clone
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/removal/removal.kshlib
|
||||
|
||||
# Tear down the scratch pool and remove the file-backed vdevs created by
# this test. Registered with log_onexit, so it runs on both pass and fail.
function cleanup
{
	poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2
	[[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1
	# was "lot_must" (command not found), which left disk2 behind
	[[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
VIRTUAL_DISK1=/var/tmp/disk1
|
||||
VIRTUAL_DISK2=/var/tmp/disk2
|
||||
log_must mkfile $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1
|
||||
log_must mkfile $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2
|
||||
|
||||
log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1
|
||||
log_must poolexists $TESTPOOL2
|
||||
|
||||
log_must zfs create $TESTPOOL2/$TESTFS
|
||||
log_must mkfile 25m /$TESTPOOL2/$TESTFS/atestfile
|
||||
log_must zfs snapshot $TESTPOOL2/$TESTFS@snap
|
||||
|
||||
log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE
|
||||
|
||||
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE0
|
||||
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE1
|
||||
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE2
|
||||
|
||||
log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2
|
||||
log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1
|
||||
wait_for_removal $TESTPOOL2
|
||||
|
||||
log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE0
|
||||
log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE1
|
||||
|
||||
log_must zfs destroy $TESTPOOL2/$TESTCLONE
|
||||
|
||||
log_pass "Clone with the livelist feature and remapped blocks," \
|
||||
"can be destroyed."
|
@ -0,0 +1,93 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2018 by Delphix. All rights reserved.
|
||||
#
|
||||
|
||||
# DESCRIPTION
|
||||
# Verify that livelists tracking remapped blocks can be
|
||||
# properly condensed.
|
||||
|
||||
# STRATEGY
|
||||
# 1. Create a pool with disk1 and create a filesystem, snapshot
|
||||
# and clone. Create two files for the first livelist entry and
|
||||
# pause condensing.
|
||||
# 2. Add disk2 to the pool and then remove disk1, triggering a
|
||||
# remap of the blkptrs tracked in the livelist.
|
||||
# 3. Overwrite the first file several times to trigger a condense,
|
||||
# overwrite the second file once and resume condensing, now with
|
||||
# extra blkptrs added during the remap
|
||||
# 4. Check that the test added new ALLOC blkptrs mid-condense using
|
||||
# a variable set in that code path
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/removal/removal.kshlib
|
||||
|
||||
# Tear down the scratch pool, restore the livelist tunables modified by
# this test and remove the file-backed vdevs. Registered with log_onexit,
# so it runs on both pass and fail.
function cleanup
{
	# make sure a failed run does not leave the condense pause enabled
	set_tunable32 zfs_livelist_condense_sync_pause 0
	poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2
	# reset livelist max size
	set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX
	[[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1
	# was "lot_must" (command not found), which left disk2 behind
	[[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries)
|
||||
set_tunable64 zfs_livelist_max_entries 0x14
|
||||
|
||||
VIRTUAL_DISK1=/var/tmp/disk1
|
||||
VIRTUAL_DISK2=/var/tmp/disk2
|
||||
log_must mkfile $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1
|
||||
log_must mkfile $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2
|
||||
|
||||
log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1
|
||||
log_must poolexists $TESTPOOL2
|
||||
|
||||
log_must zfs create $TESTPOOL2/$TESTFS
|
||||
log_must mkfile 100m /$TESTPOOL2/$TESTFS/atestfile
|
||||
log_must zfs snapshot $TESTPOOL2/$TESTFS@snap
|
||||
|
||||
log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE
|
||||
|
||||
# Create initial files and pause condense zthr on next execution
|
||||
log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A
|
||||
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/B
|
||||
log_must zpool sync $TESTPOOL2
|
||||
set_tunable32 zfs_livelist_condense_sync_pause 1
|
||||
|
||||
# Add a new dev and remove the old one
|
||||
log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2
|
||||
log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1
|
||||
wait_for_removal $TESTPOOL2
|
||||
|
||||
set_tunable32 zfs_livelist_condense_new_alloc 0
|
||||
# Trigger a condense
|
||||
log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A
|
||||
log_must zpool sync $TESTPOOL2
|
||||
log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A
|
||||
log_must zpool sync $TESTPOOL2
|
||||
# Write remapped blkptrs which will modify the livelist mid-condense
|
||||
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/B
|
||||
|
||||
# Resume condense thr
|
||||
set_tunable32 zfs_livelist_condense_sync_pause 0
|
||||
log_must zpool sync $TESTPOOL2
|
||||
# Check that we've added new ALLOC blkptrs during the condense
|
||||
[[ "0" < "$(get_tunable zfs_livelist_condense_new_alloc)" ]] || \
|
||||
log_fail "removal/condense test failed"
|
||||
|
||||
log_must zfs destroy $TESTPOOL2/$TESTCLONE
|
||||
log_pass "Clone with the livelist feature and remapped blocks," \
|
||||
"can be condensed."
|
@ -93,5 +93,6 @@ if is_linux; then
|
||||
"feature@allocation_classes"
|
||||
"feature@resilver_defer"
|
||||
"feature@bookmark_v2"
|
||||
"feature@livelist"
|
||||
)
|
||||
fi
|
||||
|
Loading…
Reference in New Issue
Block a user