mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-26 18:05:04 +03:00 
			
		
		
		
	Livelist logic should handle dedup blkptrs
Update the logic to handle the dedup-case of consecutive FREEs in the livelist code. The logic still ensures that all the FREE entries are matched up with a respective ALLOC by keeping a refcount for each FREE blkptr that we encounter and ensuring that this refcount gets to zero by the time we are done processing the livelist.

zdb -y no longer panics when encountering double frees.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #11480
Closes #12177
This commit is contained in:
		
							parent
							
								
									6b9d0eda75
								
							
						
					
					
						commit
						abd0b59e48
					
				
							
								
								
									
										117
									
								
								cmd/zdb/zdb.c
									
									
									
									
									
								
							
							
						
						
									
										117
									
								
								cmd/zdb/zdb.c
									
									
									
									
									
								
							| @ -161,12 +161,6 @@ static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, | ||||
|     dmu_tx_t *tx); | ||||
| 
 | ||||
| typedef struct sublivelist_verify { | ||||
| 	/* all ALLOC'd blkptr_t in one sub-livelist */ | ||||
| 	zfs_btree_t sv_all_allocs; | ||||
| 
 | ||||
| 	/* all FREE'd blkptr_t in one sub-livelist */ | ||||
| 	zfs_btree_t sv_all_frees; | ||||
| 
 | ||||
| 	/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ | ||||
| 	zfs_btree_t sv_pair; | ||||
| 
 | ||||
| @ -225,29 +219,68 @@ typedef struct sublivelist_verify_block { | ||||
| 
 | ||||
| static void zdb_print_blkptr(const blkptr_t *bp, int flags); | ||||
| 
 | ||||
| typedef struct sublivelist_verify_block_refcnt { | ||||
| 	/* block pointer entry in livelist being verified */ | ||||
| 	blkptr_t svbr_blk; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Refcount gets incremented to 1 when we encounter the first | ||||
| 	 * FREE entry for the svbr_blk block pointer and a node for it | ||||
| 	 * is created in our ZDB verification/tracking metadata. | ||||
| 	 * | ||||
| 	 * As we encounter more FREE entries we increment this counter | ||||
| 	 * and similarly decrement it whenever we find the respective | ||||
| 	 * ALLOC entries for this block. | ||||
| 	 * | ||||
| 	 * When the refcount gets to 0 it means that all the FREE and | ||||
| 	 * ALLOC entries of this block have paired up and we no longer | ||||
| 	 * need to track it in our verification logic (e.g. the node | ||||
| 	 * containing this struct in our verification data structure | ||||
| 	 * should be freed). | ||||
| 	 * | ||||
| 	 * [refer to sublivelist_verify_blkptr() for the actual code] | ||||
| 	 */ | ||||
| 	uint32_t svbr_refcnt; | ||||
| } sublivelist_verify_block_refcnt_t; | ||||
| 
 | ||||
| static int | ||||
| sublivelist_block_refcnt_compare(const void *larg, const void *rarg) | ||||
| { | ||||
| 	const sublivelist_verify_block_refcnt_t *l = larg; | ||||
| 	const sublivelist_verify_block_refcnt_t *r = rarg; | ||||
| 	return (livelist_compare(&l->svbr_blk, &r->svbr_blk)); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	ASSERT3P(tx, ==, NULL); | ||||
| 	struct sublivelist_verify *sv = arg; | ||||
| 	char blkbuf[BP_SPRINTF_LEN]; | ||||
| 	sublivelist_verify_block_refcnt_t current = { | ||||
| 			.svbr_blk = *bp, | ||||
| 
 | ||||
| 			/*
 | ||||
| 			 * Start with 1 in case this is the first free entry. | ||||
| 			 * This field is not used for our B-Tree comparisons | ||||
| 			 * anyway. | ||||
| 			 */ | ||||
| 			.svbr_refcnt = 1, | ||||
| 	}; | ||||
| 
 | ||||
| 	zfs_btree_index_t where; | ||||
| 	sublivelist_verify_block_refcnt_t *pair = | ||||
| 	    zfs_btree_find(&sv->sv_pair, &current, &where); | ||||
| 	if (free) { | ||||
| 		zfs_btree_add(&sv->sv_pair, bp); | ||||
| 		/* Check if the FREE is a duplicate */ | ||||
| 		if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) { | ||||
| 			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, | ||||
| 			    free); | ||||
| 			(void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf); | ||||
| 		if (pair == NULL) { | ||||
| 			/* first free entry for this block pointer */ | ||||
| 			zfs_btree_add(&sv->sv_pair, &current); | ||||
| 		} else { | ||||
| 			zfs_btree_add_idx(&sv->sv_all_frees, bp, &where); | ||||
| 			pair->svbr_refcnt++; | ||||
| 		} | ||||
| 	} else { | ||||
| 		/* Check if the ALLOC has been freed */ | ||||
| 		if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) { | ||||
| 			zfs_btree_remove_idx(&sv->sv_pair, &where); | ||||
| 		} else { | ||||
| 		if (pair == NULL) { | ||||
| 			/* block that is currently marked as allocated */ | ||||
| 			for (int i = 0; i < SPA_DVAS_PER_BP; i++) { | ||||
| 				if (DVA_IS_EMPTY(&bp->blk_dva[i])) | ||||
| 					break; | ||||
| @ -262,16 +295,16 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, | ||||
| 					    &svb, &where); | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 		/* Check if the ALLOC is a duplicate */ | ||||
| 		if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) { | ||||
| 			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, | ||||
| 			    free); | ||||
| 			(void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf); | ||||
| 		} else { | ||||
| 			zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where); | ||||
| 			/* alloc matches a free entry */ | ||||
| 			pair->svbr_refcnt--; | ||||
| 			if (pair->svbr_refcnt == 0) { | ||||
| 				/* all allocs and frees have been matched */ | ||||
| 				zfs_btree_remove_idx(&sv->sv_pair, &where); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| @ -279,32 +312,22 @@ static int | ||||
| sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) | ||||
| { | ||||
| 	int err; | ||||
| 	char blkbuf[BP_SPRINTF_LEN]; | ||||
| 	struct sublivelist_verify *sv = args; | ||||
| 
 | ||||
| 	zfs_btree_create(&sv->sv_all_allocs, livelist_compare, | ||||
| 	    sizeof (blkptr_t)); | ||||
| 
 | ||||
| 	zfs_btree_create(&sv->sv_all_frees, livelist_compare, | ||||
| 	    sizeof (blkptr_t)); | ||||
| 
 | ||||
| 	zfs_btree_create(&sv->sv_pair, livelist_compare, | ||||
| 	    sizeof (blkptr_t)); | ||||
| 	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, | ||||
| 	    sizeof (sublivelist_verify_block_refcnt_t)); | ||||
| 
 | ||||
| 	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, | ||||
| 	    sv, NULL); | ||||
| 
 | ||||
| 	zfs_btree_clear(&sv->sv_all_allocs); | ||||
| 	zfs_btree_destroy(&sv->sv_all_allocs); | ||||
| 
 | ||||
| 	zfs_btree_clear(&sv->sv_all_frees); | ||||
| 	zfs_btree_destroy(&sv->sv_all_frees); | ||||
| 
 | ||||
| 	blkptr_t *e; | ||||
| 	sublivelist_verify_block_refcnt_t *e; | ||||
| 	zfs_btree_index_t *cookie = NULL; | ||||
| 	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { | ||||
| 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE); | ||||
| 		(void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf); | ||||
| 		char blkbuf[BP_SPRINTF_LEN]; | ||||
| 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), | ||||
| 		    &e->svbr_blk, B_TRUE); | ||||
| 		(void) printf("\tERROR: %d unmatched FREE(s): %s\n", | ||||
| 		    e->svbr_refcnt, blkbuf); | ||||
| 	} | ||||
| 	zfs_btree_destroy(&sv->sv_pair); | ||||
| 
 | ||||
| @ -613,10 +636,14 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) | ||||
| /*
 | ||||
|  * [Livelist Check] | ||||
|  * Iterate through all the sublivelists and: | ||||
|  * - report leftover frees | ||||
|  * - report double ALLOCs/FREEs | ||||
|  * - report leftover frees (**) | ||||
|  * - record leftover ALLOCs together with their TXG [see Cross Check] | ||||
|  * | ||||
|  * (**) Note: Double ALLOCs are valid in datasets that have dedup | ||||
|  *      enabled. Similarly double FREEs are allowed as well but | ||||
|  *      only if they pair up with a corresponding ALLOC entry once | ||||
|  *      we are done with our sublivelist iteration. | ||||
|  * | ||||
|  * [Spacemap Check] | ||||
|  * for each metaslab: | ||||
|  * - iterate over spacemap and then the metaslab's entries in the | ||||
|  | ||||
| @ -909,15 +909,16 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, | ||||
| } | ||||
| 
 | ||||
| typedef struct livelist_entry { | ||||
| 	const blkptr_t *le_bp; | ||||
| 	blkptr_t le_bp; | ||||
| 	uint32_t le_refcnt; | ||||
| 	avl_node_t le_node; | ||||
| } livelist_entry_t; | ||||
| 
 | ||||
| static int | ||||
| livelist_compare(const void *larg, const void *rarg) | ||||
| { | ||||
| 	const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp; | ||||
| 	const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp; | ||||
| 	const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp; | ||||
| 	const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp; | ||||
| 
 | ||||
| 	/* Sort them according to dva[0] */ | ||||
| 	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); | ||||
| @ -944,6 +945,11 @@ struct livelist_iter_arg { | ||||
|  * Expects an AVL tree which is incrementally filled with FREE blkptrs | ||||
|  * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a | ||||
|  * corresponding FREE are stored in the supplied bplist. | ||||
|  * | ||||
|  * Note that multiple FREE and ALLOC entries for the same blkptr may | ||||
|  * be encountered when dedup is involved. For this reason we keep a | ||||
|  * refcount for all the FREE entries of each blkptr and ensure that | ||||
|  * each of those FREE entries has a corresponding ALLOC preceding it. | ||||
|  */ | ||||
| static int | ||||
| dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
| @ -957,23 +963,47 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
| 
 | ||||
| 	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) | ||||
| 		return (SET_ERROR(EINTR)); | ||||
| 	if (bp_freed) { | ||||
| 		livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t), | ||||
| 		    KM_SLEEP); | ||||
| 		blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); | ||||
| 		*temp_bp = *bp; | ||||
| 		node->le_bp = temp_bp; | ||||
| 		avl_add(avl, node); | ||||
| 	} else { | ||||
| 
 | ||||
| 	livelist_entry_t node; | ||||
| 		node.le_bp = bp; | ||||
| 	node.le_bp = *bp; | ||||
| 	livelist_entry_t *found = avl_find(avl, &node, NULL); | ||||
| 		if (found != NULL) { | ||||
| 	if (bp_freed) { | ||||
| 		if (found == NULL) { | ||||
| 			/* first free entry for this blkptr */ | ||||
| 			livelist_entry_t *e = | ||||
| 			    kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP); | ||||
| 			e->le_bp = *bp; | ||||
| 			e->le_refcnt = 1; | ||||
| 			avl_add(avl, e); | ||||
| 		} else { | ||||
| 			/* dedup block free */ | ||||
| 			ASSERT(BP_GET_DEDUP(bp)); | ||||
| 			ASSERT3U(BP_GET_CHECKSUM(bp), ==, | ||||
| 			    BP_GET_CHECKSUM(&found->le_bp)); | ||||
| 			ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); | ||||
| 			found->le_refcnt++; | ||||
| 		} | ||||
| 	} else { | ||||
| 		if (found == NULL) { | ||||
| 			/* block is currently marked as allocated */ | ||||
| 			bplist_append(to_free, bp); | ||||
| 		} else { | ||||
| 			/* alloc matches a free entry */ | ||||
| 			ASSERT3U(found->le_refcnt, !=, 0); | ||||
| 			found->le_refcnt--; | ||||
| 			if (found->le_refcnt == 0) { | ||||
| 				/* all tracked free pairs have been matched */ | ||||
| 				avl_remove(avl, found); | ||||
| 			kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t)); | ||||
| 				kmem_free(found, sizeof (livelist_entry_t)); | ||||
| 			} else { | ||||
| 			bplist_append(to_free, bp); | ||||
| 				/*
 | ||||
| 				 * This is definitely a deduped blkptr so | ||||
| 				 * let's validate it. | ||||
| 				 */ | ||||
| 				ASSERT(BP_GET_DEDUP(bp)); | ||||
| 				ASSERT3U(BP_GET_CHECKSUM(bp), ==, | ||||
| 				    BP_GET_CHECKSUM(&found->le_bp)); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	return (0); | ||||
| @ -999,6 +1029,7 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, | ||||
| 	}; | ||||
| 	int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); | ||||
| 
 | ||||
| 	VERIFY0(avl_numnodes(&avl)); | ||||
| 	avl_destroy(&avl); | ||||
| 	return (err); | ||||
| } | ||||
|  | ||||
| @ -165,8 +165,8 @@ tags = ['functional', 'cli_root', 'zfs_create'] | ||||
| 
 | ||||
| [tests/functional/cli_root/zfs_destroy] | ||||
| tests = ['zfs_clone_livelist_condense_and_disable', | ||||
|     'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos', | ||||
|     'zfs_destroy_002_pos', 'zfs_destroy_003_pos', | ||||
|     'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup', | ||||
|     'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', | ||||
|     'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', | ||||
|     'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', | ||||
|     'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', | ||||
|  | ||||
| @ -0,0 +1,88 @@ | ||||
| #!/bin/ksh -p | ||||
| # | ||||
| # This file and its contents are supplied under the terms of the | ||||
| # Common Development and Distribution License ("CDDL"), version 1.0. | ||||
| # You may only use this file in accordance with the terms of version | ||||
| # 1.0 of the CDDL. | ||||
| # | ||||
| # A full copy of the text of the CDDL should have accompanied this | ||||
| # source.  A copy of the CDDL is also available via the Internet at | ||||
| # http://www.illumos.org/license/CDDL. | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2021 by Delphix. All rights reserved. | ||||
| # | ||||
| 
 | ||||
| # DESCRIPTION | ||||
| # Verify zfs destroy test for clones with livelists that contain | ||||
| # dedup blocks. This test is a baseline regression test created | ||||
| # to ensure that past bugs that we've encountered between dedup | ||||
| # and the livelist logic don't resurface. | ||||
| 
 | ||||
| # STRATEGY | ||||
| # 1. Create a clone from a test filesystem and enable dedup. | ||||
| # 2. Write some data and create a livelist. | ||||
| # 3. Copy the data within the clone to create dedup blocks. | ||||
| # 4. Remove some of the dedup data to create multiple free | ||||
| #    entries for the same block pointers. | ||||
| # 5. Process all the livelist entries by destroying the clone. | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| . $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib | ||||
| 
 | ||||
| function cleanup | ||||
| { | ||||
| 	log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 | ||||
| 	# Restore the minimum percent shared to its original value | ||||
| 	set_tunable32 LIVELIST_MIN_PERCENT_SHARED $ORIGINAL_MIN_SHARED | ||||
| } | ||||
| 
 | ||||
| function test_dedup | ||||
| { | ||||
| 	# Set a small percent shared threshold so the livelist is not disabled | ||||
| 	set_tunable32 LIVELIST_MIN_PERCENT_SHARED 10 | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE | ||||
| 
 | ||||
| 	# Enable dedup | ||||
| 	log_must zfs set dedup=on $TESTPOOL/$TESTCLONE | ||||
| 
 | ||||
| 	# Create some data to be deduped | ||||
| 	log_must dd if=/dev/urandom of="/$TESTPOOL/$TESTCLONE/data" bs=512 count=10k | ||||
| 
 | ||||
| 	# Create dedup blocks | ||||
| 	# Note: We sync before and after so all dedup blocks belong to the | ||||
| 	#       same TXG, otherwise they won't look identical to the livelist | ||||
| 	#       iterator due to their logical birth TXG being different. | ||||
| 	log_must zpool sync $TESTPOOL | ||||
| 	log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-0 | ||||
| 	log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-1 | ||||
| 	log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-2 | ||||
| 	log_must cp /$TESTPOOL/$TESTCLONE/data /$TESTPOOL/$TESTCLONE/data-dup-3 | ||||
| 	log_must zpool sync $TESTPOOL | ||||
| 	check_livelist_exists $TESTCLONE | ||||
| 
 | ||||
| 	# Introduce "double frees" | ||||
| 	#   We want to introduce consecutive FREEs of the same block as this | ||||
| 	#   was what triggered past panics. | ||||
| 	# Note: Similarly to the previous step we sync before and after our | ||||
| 	#       deletions so all the entries end up in the same TXG. | ||||
| 	log_must zpool sync $TESTPOOL | ||||
| 	log_must rm /$TESTPOOL/$TESTCLONE/data-dup-2 | ||||
| 	log_must rm /$TESTPOOL/$TESTCLONE/data-dup-3 | ||||
| 	log_must zpool sync $TESTPOOL | ||||
| 	check_livelist_exists $TESTCLONE | ||||
| 
 | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE | ||||
| 	check_livelist_gone | ||||
| } | ||||
| 
 | ||||
| ORIGINAL_MIN_SHARED=$(get_tunable LIVELIST_MIN_PERCENT_SHARED) | ||||
| 
 | ||||
| log_onexit cleanup | ||||
| log_must zfs create $TESTPOOL/$TESTFS1 | ||||
| log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile | ||||
| log_must zfs snapshot $TESTPOOL/$TESTFS1@snap | ||||
| test_dedup | ||||
| 
 | ||||
| log_pass "Clone's livelist processes dedup blocks as expected." | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Serapheim Dimitropoulos
						Serapheim Dimitropoulos