mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-26 09:54:22 +03:00
Livelist logic should handle dedup blkptrs
Update the logic to handle the dedup-case of consecutive FREEs in the livelist code. The logic still ensures that all the FREE entries are matched up with a respective ALLOC by keeping a refcount for each FREE blkptr that we encounter and ensuring that this refcount gets to zero by the time we are done processing the livelist. zdb -y no longer panics when encountering double frees Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Reviewed-by: John Kennedy <john.kennedy@delphix.com> Reviewed-by: Don Brady <don.brady@delphix.com> Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com> Closes #11480 Closes #12177
This commit is contained in:
parent
ea400129c3
commit
86b5f4c121
117
cmd/zdb/zdb.c
117
cmd/zdb/zdb.c
@ -163,12 +163,6 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
|
||||
dmu_tx_t *tx);
|
||||
|
||||
typedef struct sublivelist_verify {
|
||||
/* all ALLOC'd blkptr_t in one sub-livelist */
|
||||
zfs_btree_t sv_all_allocs;
|
||||
|
||||
/* all FREE'd blkptr_t in one sub-livelist */
|
||||
zfs_btree_t sv_all_frees;
|
||||
|
||||
/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
|
||||
zfs_btree_t sv_pair;
|
||||
|
||||
@ -227,29 +221,68 @@ typedef struct sublivelist_verify_block {
|
||||
|
||||
static void zdb_print_blkptr(const blkptr_t *bp, int flags);
|
||||
|
||||
typedef struct sublivelist_verify_block_refcnt {
|
||||
/* block pointer entry in livelist being verified */
|
||||
blkptr_t svbr_blk;
|
||||
|
||||
/*
|
||||
* Refcount gets incremented to 1 when we encounter the first
|
||||
* FREE entry for the svfbr block pointer and a node for it
|
||||
* is created in our ZDB verification/tracking metadata.
|
||||
*
|
||||
* As we encounter more FREE entries we increment this counter
|
||||
* and similarly decrement it whenever we find the respective
|
||||
* ALLOC entries for this block.
|
||||
*
|
||||
* When the refcount gets to 0 it means that all the FREE and
|
||||
* ALLOC entries of this block have paired up and we no longer
|
||||
* need to track it in our verification logic (e.g. the node
|
||||
* containing this struct in our verification data structure
|
||||
* should be freed).
|
||||
*
|
||||
* [refer to sublivelist_verify_blkptr() for the actual code]
|
||||
*/
|
||||
uint32_t svbr_refcnt;
|
||||
} sublivelist_verify_block_refcnt_t;
|
||||
|
||||
static int
|
||||
sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
|
||||
{
|
||||
const sublivelist_verify_block_refcnt_t *l = larg;
|
||||
const sublivelist_verify_block_refcnt_t *r = rarg;
|
||||
return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
|
||||
}
|
||||
|
||||
static int
|
||||
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT3P(tx, ==, NULL);
|
||||
struct sublivelist_verify *sv = arg;
|
||||
char blkbuf[BP_SPRINTF_LEN];
|
||||
sublivelist_verify_block_refcnt_t current = {
|
||||
.svbr_blk = *bp,
|
||||
|
||||
/*
|
||||
* Start with 1 in case this is the first free entry.
|
||||
* This field is not used for our B-Tree comparisons
|
||||
* anyway.
|
||||
*/
|
||||
.svbr_refcnt = 1,
|
||||
};
|
||||
|
||||
zfs_btree_index_t where;
|
||||
sublivelist_verify_block_refcnt_t *pair =
|
||||
zfs_btree_find(&sv->sv_pair, ¤t, &where);
|
||||
if (free) {
|
||||
zfs_btree_add(&sv->sv_pair, bp);
|
||||
/* Check if the FREE is a duplicate */
|
||||
if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
|
||||
free);
|
||||
(void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
|
||||
if (pair == NULL) {
|
||||
/* first free entry for this block pointer */
|
||||
zfs_btree_add(&sv->sv_pair, ¤t);
|
||||
} else {
|
||||
zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
|
||||
pair->svbr_refcnt++;
|
||||
}
|
||||
} else {
|
||||
/* Check if the ALLOC has been freed */
|
||||
if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
|
||||
zfs_btree_remove_idx(&sv->sv_pair, &where);
|
||||
} else {
|
||||
if (pair == NULL) {
|
||||
/* block that is currently marked as allocated */
|
||||
for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
|
||||
if (DVA_IS_EMPTY(&bp->blk_dva[i]))
|
||||
break;
|
||||
@ -264,16 +297,16 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
|
||||
&svb, &where);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Check if the ALLOC is a duplicate */
|
||||
if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
|
||||
free);
|
||||
(void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
|
||||
} else {
|
||||
zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
|
||||
/* alloc matches a free entry */
|
||||
pair->svbr_refcnt--;
|
||||
if (pair->svbr_refcnt == 0) {
|
||||
/* all allocs and frees have been matched */
|
||||
zfs_btree_remove_idx(&sv->sv_pair, &where);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@ -281,32 +314,22 @@ static int
|
||||
sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
|
||||
{
|
||||
int err;
|
||||
char blkbuf[BP_SPRINTF_LEN];
|
||||
struct sublivelist_verify *sv = args;
|
||||
|
||||
zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
|
||||
sizeof (blkptr_t));
|
||||
|
||||
zfs_btree_create(&sv->sv_all_frees, livelist_compare,
|
||||
sizeof (blkptr_t));
|
||||
|
||||
zfs_btree_create(&sv->sv_pair, livelist_compare,
|
||||
sizeof (blkptr_t));
|
||||
zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
|
||||
sizeof (sublivelist_verify_block_refcnt_t));
|
||||
|
||||
err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
|
||||
sv, NULL);
|
||||
|
||||
zfs_btree_clear(&sv->sv_all_allocs);
|
||||
zfs_btree_destroy(&sv->sv_all_allocs);
|
||||
|
||||
zfs_btree_clear(&sv->sv_all_frees);
|
||||
zfs_btree_destroy(&sv->sv_all_frees);
|
||||
|
||||
blkptr_t *e;
|
||||
sublivelist_verify_block_refcnt_t *e;
|
||||
zfs_btree_index_t *cookie = NULL;
|
||||
while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
|
||||
(void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
|
||||
char blkbuf[BP_SPRINTF_LEN];
|
||||
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
|
||||
&e->svbr_blk, B_TRUE);
|
||||
(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
|
||||
e->svbr_refcnt, blkbuf);
|
||||
}
|
||||
zfs_btree_destroy(&sv->sv_pair);
|
||||
|
||||
@ -615,10 +638,14 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
|
||||
/*
|
||||
* [Livelist Check]
|
||||
* Iterate through all the sublivelists and:
|
||||
* - report leftover frees
|
||||
* - report double ALLOCs/FREEs
|
||||
* - report leftover frees (**)
|
||||
* - record leftover ALLOCs together with their TXG [see Cross Check]
|
||||
*
|
||||
* (**) Note: Double ALLOCs are valid in datasets that have dedup
|
||||
* enabled. Similarly double FREEs are allowed as well but
|
||||
* only if they pair up with a corresponding ALLOC entry once
|
||||
* we our done with our sublivelist iteration.
|
||||
*
|
||||
* [Spacemap Check]
|
||||
* for each metaslab:
|
||||
* - iterate over spacemap and then the metaslab's entries in the
|
||||
|
@ -909,15 +909,16 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
|
||||
}
|
||||
|
||||
typedef struct livelist_entry {
|
||||
const blkptr_t *le_bp;
|
||||
blkptr_t le_bp;
|
||||
uint32_t le_refcnt;
|
||||
avl_node_t le_node;
|
||||
} livelist_entry_t;
|
||||
|
||||
static int
|
||||
livelist_compare(const void *larg, const void *rarg)
|
||||
{
|
||||
const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
|
||||
const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
|
||||
const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
|
||||
const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;
|
||||
|
||||
/* Sort them according to dva[0] */
|
||||
uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
|
||||
@ -944,6 +945,11 @@ struct livelist_iter_arg {
|
||||
* Expects an AVL tree which is incrementally filled will FREE blkptrs
|
||||
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
|
||||
* corresponding FREE are stored in the supplied bplist.
|
||||
*
|
||||
* Note that multiple FREE and ALLOC entries for the same blkptr may
|
||||
* be encountered when dedup is involved. For this reason we keep a
|
||||
* refcount for all the FREE entries of each blkptr and ensure that
|
||||
* each of those FREE entries has a corresponding ALLOC preceding it.
|
||||
*/
|
||||
static int
|
||||
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
@ -957,23 +963,47 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
|
||||
|
||||
if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
|
||||
return (SET_ERROR(EINTR));
|
||||
|
||||
livelist_entry_t node;
|
||||
node.le_bp = *bp;
|
||||
livelist_entry_t *found = avl_find(avl, &node, NULL);
|
||||
if (bp_freed) {
|
||||
livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
|
||||
KM_SLEEP);
|
||||
blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
|
||||
*temp_bp = *bp;
|
||||
node->le_bp = temp_bp;
|
||||
avl_add(avl, node);
|
||||
} else {
|
||||
livelist_entry_t node;
|
||||
node.le_bp = bp;
|
||||
livelist_entry_t *found = avl_find(avl, &node, NULL);
|
||||
if (found != NULL) {
|
||||
avl_remove(avl, found);
|
||||
kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
|
||||
kmem_free(found, sizeof (livelist_entry_t));
|
||||
if (found == NULL) {
|
||||
/* first free entry for this blkptr */
|
||||
livelist_entry_t *e =
|
||||
kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
|
||||
e->le_bp = *bp;
|
||||
e->le_refcnt = 1;
|
||||
avl_add(avl, e);
|
||||
} else {
|
||||
/* dedup block free */
|
||||
ASSERT(BP_GET_DEDUP(bp));
|
||||
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
|
||||
BP_GET_CHECKSUM(&found->le_bp));
|
||||
ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
|
||||
found->le_refcnt++;
|
||||
}
|
||||
} else {
|
||||
if (found == NULL) {
|
||||
/* block is currently marked as allocated */
|
||||
bplist_append(to_free, bp);
|
||||
} else {
|
||||
/* alloc matches a free entry */
|
||||
ASSERT3U(found->le_refcnt, !=, 0);
|
||||
found->le_refcnt--;
|
||||
if (found->le_refcnt == 0) {
|
||||
/* all tracked free pairs have been matched */
|
||||
avl_remove(avl, found);
|
||||
kmem_free(found, sizeof (livelist_entry_t));
|
||||
} else {
|
||||
/*
|
||||
* This is definitely a deduped blkptr so
|
||||
* let's validate it.
|
||||
*/
|
||||
ASSERT(BP_GET_DEDUP(bp));
|
||||
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
|
||||
BP_GET_CHECKSUM(&found->le_bp));
|
||||
}
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
@ -999,6 +1029,7 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
|
||||
};
|
||||
int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
|
||||
|
||||
VERIFY0(avl_numnodes(&avl));
|
||||
avl_destroy(&avl);
|
||||
return (err);
|
||||
}
|
||||
|
@ -166,8 +166,8 @@ tags = ['functional', 'cli_root', 'zfs_create']
|
||||
|
||||
[tests/functional/cli_root/zfs_destroy]
|
||||
tests = ['zfs_clone_livelist_condense_and_disable',
|
||||
'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos',
|
||||
'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
|
||||
'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup',
|
||||
'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
|
||||
'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
|
||||
'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
|
||||
'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
|
||||
|
@ -0,0 +1,88 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Delphix. All rights reserved.
|
||||
#
|
||||
|
||||
# DESCRIPTION
|
||||
# Verify zfs destroy test for clones with livelists that contain
|
||||
# dedup blocks. This test is a baseline regression test created
|
||||
# to ensure that past bugs that we've encountered between dedup
|
||||
# and the livelist logic don't resurface.
|
||||
|
||||
# STRATEGY
|
||||
# 1. Create a clone from a test filesystem and enable dedup.
|
||||
# 2. Write some data and create a livelist.
|
||||
# 3. Copy the data within the clone to create dedup blocks.
|
||||
# 4. Remove some of the dedup data to create multiple free
|
||||
# entries for the same block pointers.
|
||||
# 5. Process all the livelist entries by destroying the clone.
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
|
||||
|
||||
function cleanup
|
||||
{
|
||||
log_must zfs destroy -Rf $TESTPOOL/$TESTFS1
|
||||
# Reset the minimum percent shared to 75
|
||||
set_tunable32 LIVELIST_MIN_PERCENT_SHARED $ORIGINAL_MIN_SHARED
|
||||
}
|
||||
|
||||
function test_dedup
|
||||
{
|
||||
# Set a small percent shared threshold so the livelist is not disabled
|
||||
set_tunable32 LIVELIST_MIN_PERCENT_SHARED 10
|
||||
clone_dataset $TESTFS1 snap $TESTCLONE
|
||||
|
||||
# Enable dedup
|
||||
log_must zfs set dedup=on $TESTPOOL/$TESTCLONE
|
||||
|
||||
# Create some data to be deduped
|
||||
log_must dd if=/dev/urandom of="/$TESTPOOL/$TESTCLONE/data" bs=512 count=10k
|
||||
|
||||
# Create dedup blocks
|
||||
# Note: We sync before and after so all dedup blocks belong to the
|
||||
# same TXG, otherwise they won't look identical to the livelist
|
||||
# iterator due to their logical birth TXG being different.
|
||||
log_must zpool sync $TESTPOOL
|
||||
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-0
|
||||
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-1
|
||||
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-2
|
||||
log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-3
|
||||
log_must zpool sync $TESTPOOL
|
||||
check_livelist_exists $TESTCLONE
|
||||
|
||||
# Introduce "double frees"
|
||||
# We want to introduce consecutive FREEs of the same block as this
|
||||
# was what triggered past panics.
|
||||
# Note: Similarly to the previouys step we sync before and after our
|
||||
# our deletions so all the entries end up in the same TXG.
|
||||
log_must zpool sync $TESTPOOL
|
||||
log_must rm /$TESTPOOL/$TESTCLONE/data-dup-2
|
||||
log_must rm /$TESTPOOL/$TESTCLONE/data-dup-3
|
||||
log_must zpool sync $TESTPOOL
|
||||
check_livelist_exists $TESTCLONE
|
||||
|
||||
log_must zfs destroy $TESTPOOL/$TESTCLONE
|
||||
check_livelist_gone
|
||||
}
|
||||
|
||||
ORIGINAL_MIN_SHARED=$(get_tunable LIVELIST_MIN_PERCENT_SHARED)
|
||||
|
||||
log_onexit cleanup
|
||||
log_must zfs create $TESTPOOL/$TESTFS1
|
||||
log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile
|
||||
log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
|
||||
test_dedup
|
||||
|
||||
log_pass "Clone's livelist processes dedup blocks as expected."
|
Loading…
Reference in New Issue
Block a user