mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-26 18:05:04 +03:00 
			
		
		
		
	Fast Clone Deletion
Deleting a clone requires finding blocks that are clone-only, not shared with the snapshot. This was done by traversing the entire block tree, which results in a large performance penalty for sparsely written clones. This new method keeps track of clone blocks in a "Livelist" as they are modified so that, when it’s time to delete, the clone-specific blocks are already at hand. We see performance improvements because now deletion work is proportional to the number of clone-modified blocks, not the size of the original dataset. Reviewed-by: Sean Eric Fagan <sef@ixsystems.com> Reviewed-by: Matt Ahrens <matt@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Signed-off-by: Sara Hartse <sara.hartse@delphix.com> Closes #8416
This commit is contained in:
		
							parent
							
								
									d274ac5460
								
							
						
					
					
						commit
						37f03da8ba
					
				
							
								
								
									
										346
									
								
								cmd/zdb/zdb.c
									
									
									
									
									
								
							
							
						
						
									
										346
									
								
								cmd/zdb/zdb.c
									
									
									
									
									
								
							| @ -115,7 +115,8 @@ uint64_t max_inflight = 1000; | ||||
| static int leaked_objects = 0; | ||||
| static range_tree_t *mos_refd_objs; | ||||
| 
 | ||||
| static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *); | ||||
| static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, | ||||
|     boolean_t); | ||||
| static void mos_obj_refd(uint64_t); | ||||
| static void mos_obj_refd_multiple(uint64_t); | ||||
| 
 | ||||
| @ -552,12 +553,16 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) | ||||
| 		(void) printf("\t\tcomp = %s\n", comp); | ||||
| 		(void) printf("\t\tuncomp = %s\n", uncomp); | ||||
| 	} | ||||
| 	if (size >= sizeof (*bpop)) { | ||||
| 	if (size >= BPOBJ_SIZE_V2) { | ||||
| 		(void) printf("\t\tsubobjs = %llu\n", | ||||
| 		    (u_longlong_t)bpop->bpo_subobjs); | ||||
| 		(void) printf("\t\tnum_subobjs = %llu\n", | ||||
| 		    (u_longlong_t)bpop->bpo_num_subobjs); | ||||
| 	} | ||||
| 	if (size >= sizeof (*bpop)) { | ||||
| 		(void) printf("\t\tnum_freed = %llu\n", | ||||
| 		    (u_longlong_t)bpop->bpo_num_freed); | ||||
| 	} | ||||
| 
 | ||||
| 	if (dump_opt['d'] < 5) | ||||
| 		return; | ||||
| @ -572,7 +577,8 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) | ||||
| 			(void) printf("got error %u from dmu_read\n", err); | ||||
| 			break; | ||||
| 		} | ||||
| 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp); | ||||
| 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, | ||||
| 		    BP_GET_FREE(&bp)); | ||||
| 		(void) printf("\t%s\n", blkbuf); | ||||
| 	} | ||||
| } | ||||
| @ -1508,7 +1514,8 @@ blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) | ||||
| snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, | ||||
|     boolean_t bp_freed) | ||||
| { | ||||
| 	const dva_t *dva = bp->blk_dva; | ||||
| 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; | ||||
| @ -1516,6 +1523,10 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) | ||||
| 
 | ||||
| 	if (dump_opt['b'] >= 6) { | ||||
| 		snprintf_blkptr(blkbuf, buflen, bp); | ||||
| 		if (bp_freed) { | ||||
| 			(void) snprintf(blkbuf + strlen(blkbuf), | ||||
| 			    buflen - strlen(blkbuf), " %s", "FREE"); | ||||
| 		} | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| @ -1553,6 +1564,9 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) | ||||
| 		    (u_longlong_t)BP_GET_FILL(bp), | ||||
| 		    (u_longlong_t)bp->blk_birth, | ||||
| 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); | ||||
| 		if (bp_freed) | ||||
| 			(void) snprintf(blkbuf + strlen(blkbuf), | ||||
| 			    buflen - strlen(blkbuf), " %s", "FREE"); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| @ -1580,7 +1594,7 @@ print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); | ||||
| 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); | ||||
| 	(void) printf("%s\n", blkbuf); | ||||
| } | ||||
| 
 | ||||
| @ -1815,12 +1829,12 @@ dump_bptree(objset_t *os, uint64_t obj, const char *name) | ||||
| 
 | ||||
| /* ARGSUSED */ | ||||
| static int | ||||
| dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) | ||||
| { | ||||
| 	char blkbuf[BP_SPRINTF_LEN]; | ||||
| 
 | ||||
| 	ASSERT(bp->blk_birth != 0); | ||||
| 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); | ||||
| 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); | ||||
| 	(void) printf("\t%s\n", blkbuf); | ||||
| 	return (0); | ||||
| } | ||||
| @ -1845,14 +1859,28 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) | ||||
| 	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { | ||||
| 		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); | ||||
| 		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); | ||||
| 		(void) printf("    %*s: object %llu, %llu local blkptrs, " | ||||
| 		    "%llu subobjs in object, %llu, %s (%s/%s comp)\n", | ||||
| 		if (bpo->bpo_havefreed) { | ||||
| 			(void) printf("    %*s: object %llu, %llu local " | ||||
| 			    "blkptrs, %llu freed, %llu subobjs in object %llu, " | ||||
| 			    "%s (%s/%s comp)\n", | ||||
| 			    indent * 8, name, | ||||
| 			    (u_longlong_t)bpo->bpo_object, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs, | ||||
| 			    bytes, comp, uncomp); | ||||
| 		} else { | ||||
| 			(void) printf("    %*s: object %llu, %llu local " | ||||
| 			    "blkptrs, %llu subobjs in object %llu, " | ||||
| 			    "%s (%s/%s comp)\n", | ||||
| 			    indent * 8, name, | ||||
| 			    (u_longlong_t)bpo->bpo_object, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs, | ||||
| 			    bytes, comp, uncomp); | ||||
| 		} | ||||
| 
 | ||||
| 		for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { | ||||
| 			uint64_t subobj; | ||||
| @ -1872,12 +1900,23 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) | ||||
| 			bpobj_close(&subbpo); | ||||
| 		} | ||||
| 	} else { | ||||
| 		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n", | ||||
| 		if (bpo->bpo_havefreed) { | ||||
| 			(void) printf("    %*s: object %llu, %llu blkptrs, " | ||||
| 			    "%llu freed, %s\n", | ||||
| 			    indent * 8, name, | ||||
| 			    (u_longlong_t)bpo->bpo_object, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed, | ||||
| 			    bytes); | ||||
| 		} else { | ||||
| 			(void) printf("    %*s: object %llu, %llu blkptrs, " | ||||
| 			    "%s\n", | ||||
| 			    indent * 8, name, | ||||
| 			    (u_longlong_t)bpo->bpo_object, | ||||
| 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, | ||||
| 			    bytes); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (dump_opt['d'] < 5) | ||||
| 		return; | ||||
| @ -2038,58 +2077,20 @@ bpobj_count_refd(bpobj_t *bpo) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| dump_deadlist(dsl_deadlist_t *dl) | ||||
| static int | ||||
| dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) | ||||
| { | ||||
| 	dsl_deadlist_entry_t *dle; | ||||
| 	uint64_t unused; | ||||
| 	char bytes[32]; | ||||
| 	char comp[32]; | ||||
| 	char uncomp[32]; | ||||
| 	uint64_t empty_bpobj = | ||||
| 	    dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj; | ||||
| 
 | ||||
| 	/* force the tree to be loaded */ | ||||
| 	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); | ||||
| 
 | ||||
| 	if (dl->dl_oldfmt) { | ||||
| 		if (dl->dl_bpobj.bpo_object != empty_bpobj) | ||||
| 			bpobj_count_refd(&dl->dl_bpobj); | ||||
| 	} else { | ||||
| 		mos_obj_refd(dl->dl_object); | ||||
| 		for (dle = avl_first(&dl->dl_tree); dle; | ||||
| 		    dle = AVL_NEXT(&dl->dl_tree, dle)) { | ||||
| 	spa_t *spa = arg; | ||||
| 	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; | ||||
| 	if (dle->dle_bpobj.bpo_object != empty_bpobj) | ||||
| 		bpobj_count_refd(&dle->dle_bpobj); | ||||
| 		} | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| 	/* make sure nicenum has enough space */ | ||||
| 	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); | ||||
| 	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); | ||||
| 	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); | ||||
| 
 | ||||
| 	if (dump_opt['d'] < 3) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (dl->dl_oldfmt) { | ||||
| 		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); | ||||
| 	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); | ||||
| 	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); | ||||
| 	(void) printf("\n    Deadlist: %s (%s/%s comp)\n", | ||||
| 	    bytes, comp, uncomp); | ||||
| 
 | ||||
| 	if (dump_opt['d'] < 4) | ||||
| 		return; | ||||
| 
 | ||||
| 	(void) printf("\n"); | ||||
| 
 | ||||
| 	for (dle = avl_first(&dl->dl_tree); dle; | ||||
| 	    dle = AVL_NEXT(&dl->dl_tree, dle)) { | ||||
| static int | ||||
| dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) | ||||
| { | ||||
| 	ASSERT(arg == NULL); | ||||
| 	if (dump_opt['d'] >= 5) { | ||||
| 		char buf[128]; | ||||
| 		(void) snprintf(buf, sizeof (buf), | ||||
| @ -2103,7 +2104,98 @@ dump_deadlist(dsl_deadlist_t *dl) | ||||
| 		    (longlong_t)dle->dle_mintxg, | ||||
| 		    (longlong_t)dle->dle_bpobj.bpo_object); | ||||
| 	} | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| dump_blkptr_list(dsl_deadlist_t *dl, char *name) | ||||
| { | ||||
| 	char bytes[32]; | ||||
| 	char comp[32]; | ||||
| 	char uncomp[32]; | ||||
| 	char entries[32]; | ||||
| 	spa_t *spa = dmu_objset_spa(dl->dl_os); | ||||
| 	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; | ||||
| 
 | ||||
| 	if (dl->dl_oldfmt) { | ||||
| 		if (dl->dl_bpobj.bpo_object != empty_bpobj) | ||||
| 			bpobj_count_refd(&dl->dl_bpobj); | ||||
| 	} else { | ||||
| 		mos_obj_refd(dl->dl_object); | ||||
| 		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa); | ||||
| 	} | ||||
| 
 | ||||
| 	/* make sure nicenum has enough space */ | ||||
| 	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); | ||||
| 	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); | ||||
| 	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); | ||||
| 	CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ); | ||||
| 
 | ||||
| 	if (dump_opt['d'] < 3) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (dl->dl_oldfmt) { | ||||
| 		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); | ||||
| 	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); | ||||
| 	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); | ||||
| 	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); | ||||
| 	(void) printf("\n    %s: %s (%s/%s comp), %s entries\n", | ||||
| 	    name, bytes, comp, uncomp, entries); | ||||
| 
 | ||||
| 	if (dump_opt['d'] < 4) | ||||
| 		return; | ||||
| 
 | ||||
| 	(void) printf("\n"); | ||||
| 
 | ||||
| 	dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| verify_dd_livelist(objset_t *os) | ||||
| { | ||||
| 	uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; | ||||
| 	dsl_pool_t *dp = spa_get_dsl(os->os_spa); | ||||
| 	dsl_dir_t  *dd = os->os_dsl_dataset->ds_dir; | ||||
| 
 | ||||
| 	ASSERT(!dmu_objset_is_snapshot(os)); | ||||
| 	if (!dsl_deadlist_is_open(&dd->dd_livelist)) | ||||
| 		return (0); | ||||
| 	dsl_pool_config_enter(dp, FTAG); | ||||
| 	dsl_deadlist_space(&dd->dd_livelist, &ll_used, | ||||
| 	    &ll_comp, &ll_uncomp); | ||||
| 
 | ||||
| 	dsl_dataset_t *origin_ds; | ||||
| 	ASSERT(dsl_pool_config_held(dp)); | ||||
| 	VERIFY0(dsl_dataset_hold_obj(dp, | ||||
| 	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); | ||||
| 	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, | ||||
| 	    &used, &comp, &uncomp)); | ||||
| 	dsl_dataset_rele(origin_ds, FTAG); | ||||
| 	dsl_pool_config_exit(dp, FTAG); | ||||
| 	/*
 | ||||
| 	 *  It's possible that the dataset's uncomp space is larger than the | ||||
| 	 *  livelist's because livelists do not track embedded block pointers | ||||
| 	 */ | ||||
| 	if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { | ||||
| 		char nice_used[32], nice_comp[32], nice_uncomp[32]; | ||||
| 		(void) printf("Discrepancy in space accounting:\n"); | ||||
| 		zdb_nicenum(used, nice_used, sizeof (nice_used)); | ||||
| 		zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); | ||||
| 		zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); | ||||
| 		(void) printf("dir: used %s, comp %s, uncomp %s\n", | ||||
| 		    nice_used, nice_comp, nice_uncomp); | ||||
| 		zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); | ||||
| 		zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); | ||||
| 		zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); | ||||
| 		(void) printf("livelist: used %s, comp %s, uncomp %s\n", | ||||
| 		    nice_used, nice_comp, nice_uncomp); | ||||
| 		return (1); | ||||
| 	} | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| static avl_tree_t idx_tree; | ||||
| @ -2643,7 +2735,7 @@ static const char *objset_types[DMU_OST_NUMTYPES] = { | ||||
| 	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; | ||||
| 
 | ||||
| static void | ||||
| dump_dir(objset_t *os) | ||||
| dump_objset(objset_t *os) | ||||
| { | ||||
| 	dmu_objset_stats_t dds; | ||||
| 	uint64_t object, object_count; | ||||
| @ -2716,11 +2808,17 @@ dump_dir(objset_t *os) | ||||
| 
 | ||||
| 	if (dmu_objset_ds(os) != NULL) { | ||||
| 		dsl_dataset_t *ds = dmu_objset_ds(os); | ||||
| 		dump_deadlist(&ds->ds_deadlist); | ||||
| 		dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); | ||||
| 		if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && | ||||
| 		    !dmu_objset_is_snapshot(os)) { | ||||
| 			dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); | ||||
| 			if (verify_dd_livelist(os) != 0) | ||||
| 				fatal("livelist is incorrect"); | ||||
| 		} | ||||
| 
 | ||||
| 		if (dsl_dataset_remap_deadlist_exists(ds)) { | ||||
| 			(void) printf("ds_remap_deadlist:\n"); | ||||
| 			dump_deadlist(&ds->ds_remap_deadlist); | ||||
| 			dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); | ||||
| 		} | ||||
| 		count_ds_mos_objects(ds); | ||||
| 	} | ||||
| @ -3470,7 +3568,7 @@ static uint64_t remap_deadlist_count = 0; | ||||
| 
 | ||||
| /*ARGSUSED*/ | ||||
| static int | ||||
| dump_one_dir(const char *dsname, void *arg) | ||||
| dump_one_objset(const char *dsname, void *arg) | ||||
| { | ||||
| 	int error; | ||||
| 	objset_t *os; | ||||
| @ -3502,7 +3600,12 @@ dump_one_dir(const char *dsname, void *arg) | ||||
| 			global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; | ||||
| 	} | ||||
| 
 | ||||
| 	dump_dir(os); | ||||
| 	if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && | ||||
| 	    !dmu_objset_is_snapshot(os)) { | ||||
| 		global_feature_count[SPA_FEATURE_LIVELIST]++; | ||||
| 	} | ||||
| 
 | ||||
| 	dump_objset(os); | ||||
| 	close_objset(os, FTAG); | ||||
| 	fuid_table_destroy(); | ||||
| 	return (0); | ||||
| @ -3993,13 +4096,15 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) | ||||
| 
 | ||||
| /* ARGSUSED */ | ||||
| static int | ||||
| increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	zdb_cb_t *zcb = arg; | ||||
| 	spa_t *spa = zcb->zcb_spa; | ||||
| 	vdev_t *vd; | ||||
| 	const dva_t *dva = &bp->blk_dva[0]; | ||||
| 
 | ||||
| 	ASSERT(!bp_freed); | ||||
| 	ASSERT(!dump_opt['L']); | ||||
| 	ASSERT3U(BP_GET_NDVAS(bp), ==, 1); | ||||
| 
 | ||||
| @ -4617,6 +4722,101 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Iterate over livelists which have been destroyed by the user but | ||||
|  * are still present in the MOS, waiting to be freed | ||||
|  */ | ||||
| typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); | ||||
| 
 | ||||
| static void | ||||
| iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) | ||||
| { | ||||
| 	objset_t *mos = spa->spa_meta_objset; | ||||
| 	uint64_t zap_obj; | ||||
| 	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, | ||||
| 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); | ||||
| 	if (err == ENOENT) | ||||
| 		return; | ||||
| 	ASSERT0(err); | ||||
| 
 | ||||
| 	zap_cursor_t zc; | ||||
| 	zap_attribute_t attr; | ||||
| 	dsl_deadlist_t ll; | ||||
| 	/* NULL out os prior to dsl_deadlist_open in case it's garbage */ | ||||
| 	ll.dl_os = NULL; | ||||
| 	for (zap_cursor_init(&zc, mos, zap_obj); | ||||
| 	    zap_cursor_retrieve(&zc, &attr) == 0; | ||||
| 	    (void) zap_cursor_advance(&zc)) { | ||||
| 		dsl_deadlist_open(&ll, mos, attr.za_first_integer); | ||||
| 		func(&ll, arg); | ||||
| 		dsl_deadlist_close(&ll); | ||||
| 	} | ||||
| 	zap_cursor_fini(&zc); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	ASSERT(!bp_freed); | ||||
| 	return (count_block_cb(arg, bp, tx)); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) | ||||
| { | ||||
| 	zdb_cb_t *zbc = args; | ||||
| 	bplist_t blks; | ||||
| 	bplist_create(&blks); | ||||
| 	/* determine which blocks have been alloc'd but not freed */ | ||||
| 	VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); | ||||
| 	/* count those blocks */ | ||||
| 	(void) bplist_iterate(&blks, count_block_cb, zbc, NULL); | ||||
| 	bplist_destroy(&blks); | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| livelist_count_blocks(dsl_deadlist_t *ll, void *arg) | ||||
| { | ||||
| 	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Count the blocks in the livelists that have been destroyed by the user | ||||
|  * but haven't yet been freed. | ||||
|  */ | ||||
| static void | ||||
| deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) | ||||
| { | ||||
| 	iterate_deleted_livelists(spa, livelist_count_blocks, zbc); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| dump_livelist_cb(dsl_deadlist_t *ll, void *arg) | ||||
| { | ||||
| 	ASSERT3P(arg, ==, NULL); | ||||
| 	global_feature_count[SPA_FEATURE_LIVELIST]++; | ||||
| 	dump_blkptr_list(ll, "Deleted Livelist"); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Print out, register object references to, and increment feature counts for | ||||
|  * livelists that have been destroyed by the user but haven't yet been freed. | ||||
|  */ | ||||
| static void | ||||
| deleted_livelists_dump_mos(spa_t *spa) | ||||
| { | ||||
| 	uint64_t zap_obj; | ||||
| 	objset_t *mos = spa->spa_meta_objset; | ||||
| 	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, | ||||
| 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); | ||||
| 	if (err == ENOENT) | ||||
| 		return; | ||||
| 	mos_obj_refd(zap_obj); | ||||
| 	iterate_deleted_livelists(spa, dump_livelist_cb, NULL); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| dump_block_stats(spa_t *spa) | ||||
| { | ||||
| @ -4656,11 +4856,11 @@ dump_block_stats(spa_t *spa) | ||||
| 	 * If there's a deferred-free bplist, process that first. | ||||
| 	 */ | ||||
| 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, | ||||
| 	    count_block_cb, &zcb, NULL); | ||||
| 	    bpobj_count_block_cb, &zcb, NULL); | ||||
| 
 | ||||
| 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { | ||||
| 		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, | ||||
| 		    count_block_cb, &zcb, NULL); | ||||
| 		    bpobj_count_block_cb, &zcb, NULL); | ||||
| 	} | ||||
| 
 | ||||
| 	zdb_claim_removing(spa, &zcb); | ||||
| @ -4671,6 +4871,8 @@ dump_block_stats(spa_t *spa) | ||||
| 		    &zcb, NULL)); | ||||
| 	} | ||||
| 
 | ||||
| 	deleted_livelists_count_blocks(spa, &zcb); | ||||
| 
 | ||||
| 	if (dump_opt['c'] > 1) | ||||
| 		flags |= TRAVERSE_PREFETCH_DATA; | ||||
| 
 | ||||
| @ -5706,6 +5908,7 @@ dump_mos_leaks(spa_t *spa) | ||||
| 		mos_obj_refd(vim->vim_phys->vimp_counts_object); | ||||
| 		vdev_indirect_mapping_close(vim); | ||||
| 	} | ||||
| 	deleted_livelists_dump_mos(spa); | ||||
| 
 | ||||
| 	if (dp->dp_origin_snap != NULL) { | ||||
| 		dsl_dataset_t *ds; | ||||
| @ -5715,12 +5918,12 @@ dump_mos_leaks(spa_t *spa) | ||||
| 		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, | ||||
| 		    FTAG, &ds)); | ||||
| 		count_ds_mos_objects(ds); | ||||
| 		dump_deadlist(&ds->ds_deadlist); | ||||
| 		dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); | ||||
| 		dsl_dataset_rele(ds, FTAG); | ||||
| 		dsl_pool_config_exit(dp, FTAG); | ||||
| 
 | ||||
| 		count_ds_mos_objects(dp->dp_origin_snap); | ||||
| 		dump_deadlist(&dp->dp_origin_snap->ds_deadlist); | ||||
| 		dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); | ||||
| 	} | ||||
| 	count_dir_mos_objects(dp->dp_mos_dir); | ||||
| 	if (dp->dp_free_dir != NULL) | ||||
| @ -5885,7 +6088,7 @@ dump_zpool(spa_t *spa) | ||||
| 	if (dump_opt['d'] || dump_opt['i']) { | ||||
| 		spa_feature_t f; | ||||
| 		mos_refd_objs = range_tree_create(NULL, NULL); | ||||
| 		dump_dir(dp->dp_meta_objset); | ||||
| 		dump_objset(dp->dp_meta_objset); | ||||
| 
 | ||||
| 		if (dump_opt['d'] >= 3) { | ||||
| 			dsl_pool_t *dp = spa->spa_dsl_pool; | ||||
| @ -5915,8 +6118,9 @@ dump_zpool(spa_t *spa) | ||||
| 			global_feature_count[f] = UINT64_MAX; | ||||
| 		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; | ||||
| 		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; | ||||
| 		global_feature_count[SPA_FEATURE_LIVELIST] = 0; | ||||
| 
 | ||||
| 		(void) dmu_objset_find(spa_name(spa), dump_one_dir, | ||||
| 		(void) dmu_objset_find(spa_name(spa), dump_one_objset, | ||||
| 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); | ||||
| 
 | ||||
| 		if (rc == 0 && !dump_opt['L']) | ||||
| @ -6777,9 +6981,9 @@ main(int argc, char **argv) | ||||
| 			} | ||||
| 		} | ||||
| 		if (os != NULL) { | ||||
| 			dump_dir(os); | ||||
| 			dump_objset(os); | ||||
| 		} else if (zopt_objects > 0 && !dump_opt['m']) { | ||||
| 			dump_dir(spa->spa_meta_objset); | ||||
| 			dump_objset(spa->spa_meta_objset); | ||||
| 		} else { | ||||
| 			dump_zpool(spa); | ||||
| 		} | ||||
|  | ||||
| @ -20,6 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2018 by Delphix. All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| #ifndef	_SYS_BPLIST_H | ||||
| @ -49,6 +50,7 @@ void bplist_destroy(bplist_t *bpl); | ||||
| void bplist_append(bplist_t *bpl, const blkptr_t *bp); | ||||
| void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, | ||||
|     void *arg, dmu_tx_t *tx); | ||||
| void bplist_clear(bplist_t *bpl); | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| } | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012, 2015 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2015, 2019 by Delphix. All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| #ifndef	_SYS_BPOBJ_H | ||||
| @ -31,6 +31,7 @@ | ||||
| #include <sys/txg.h> | ||||
| #include <sys/zio.h> | ||||
| #include <sys/zfs_context.h> | ||||
| #include <sys/bplist.h> | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| extern "C" { | ||||
| @ -48,10 +49,12 @@ typedef struct bpobj_phys { | ||||
| 	uint64_t	bpo_uncomp; | ||||
| 	uint64_t	bpo_subobjs; | ||||
| 	uint64_t	bpo_num_subobjs; | ||||
| 	uint64_t	bpo_num_freed; | ||||
| } bpobj_phys_t; | ||||
| 
 | ||||
| #define	BPOBJ_SIZE_V0	(2 * sizeof (uint64_t)) | ||||
| #define	BPOBJ_SIZE_V1	(4 * sizeof (uint64_t)) | ||||
| #define	BPOBJ_SIZE_V2	(6 * sizeof (uint64_t)) | ||||
| 
 | ||||
| typedef struct bpobj { | ||||
| 	kmutex_t	bpo_lock; | ||||
| @ -60,12 +63,14 @@ typedef struct bpobj { | ||||
| 	int		bpo_epb; | ||||
| 	uint8_t		bpo_havecomp; | ||||
| 	uint8_t		bpo_havesubobj; | ||||
| 	uint8_t		bpo_havefreed; | ||||
| 	bpobj_phys_t	*bpo_phys; | ||||
| 	dmu_buf_t	*bpo_dbuf; | ||||
| 	dmu_buf_t	*bpo_cached_dbuf; | ||||
| } bpobj_t; | ||||
| 
 | ||||
| typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); | ||||
| typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx); | ||||
| 
 | ||||
| uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); | ||||
| uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); | ||||
| @ -77,10 +82,13 @@ void bpobj_close(bpobj_t *bpo); | ||||
| boolean_t bpobj_is_open(const bpobj_t *bpo); | ||||
| 
 | ||||
| int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); | ||||
| int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); | ||||
| int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *); | ||||
| int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, | ||||
|     void *arg, int64_t start); | ||||
| 
 | ||||
| void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); | ||||
| void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); | ||||
| void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx); | ||||
| 
 | ||||
| int bpobj_space(bpobj_t *bpo, | ||||
|     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); | ||||
| @ -88,6 +96,9 @@ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, | ||||
|     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); | ||||
| boolean_t bpobj_is_empty(bpobj_t *bpo); | ||||
| 
 | ||||
| int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx); | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| } | ||||
| #endif | ||||
|  | ||||
| @ -383,6 +383,7 @@ typedef struct dmu_buf { | ||||
| #define	DMU_POOL_CONDENSING_INDIRECT	"com.delphix:condensing_indirect" | ||||
| #define	DMU_POOL_ZPOOL_CHECKPOINT	"com.delphix:zpool_checkpoint" | ||||
| #define	DMU_POOL_LOG_SPACEMAP_ZAP	"com.delphix:log_spacemap_zap" | ||||
| #define	DMU_POOL_DELETED_CLONES		"com.delphix:deleted_clones" | ||||
| 
 | ||||
| /*
 | ||||
|  * Allocate an object from this objset.  The range of object numbers | ||||
| @ -1003,6 +1004,7 @@ extern uint64_t dmu_objset_id(objset_t *os); | ||||
| extern uint64_t dmu_objset_dnodesize(objset_t *os); | ||||
| extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); | ||||
| extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); | ||||
| extern int dmu_objset_blksize(objset_t *os); | ||||
| extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, | ||||
|     uint64_t *id, uint64_t *offp, boolean_t *case_conflict); | ||||
| extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); | ||||
|  | ||||
| @ -126,7 +126,7 @@ struct objset { | ||||
| 	zfs_cache_type_t os_secondary_cache; | ||||
| 	zfs_sync_type_t os_sync; | ||||
| 	zfs_redundant_metadata_type_t os_redundant_metadata; | ||||
| 	int os_recordsize; | ||||
| 	uint64_t os_recordsize; | ||||
| 	/*
 | ||||
| 	 * The next four values are used as a cache of whatever's on disk, and | ||||
| 	 * are initialized the first time these properties are queried. Before | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2015 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2018, 2019 by Delphix. All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| #ifndef	_SYS_DSL_DEADLIST_H | ||||
| @ -28,12 +28,14 @@ | ||||
| 
 | ||||
| #include <sys/bpobj.h> | ||||
| #include <sys/zfs_context.h> | ||||
| #include <sys/zthr.h> | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| extern "C" { | ||||
| #endif | ||||
| 
 | ||||
| struct dmu_buf; | ||||
| struct dsl_pool; | ||||
| struct dsl_dataset; | ||||
| 
 | ||||
| typedef struct dsl_deadlist_phys { | ||||
| @ -63,13 +65,34 @@ typedef struct dsl_deadlist_entry { | ||||
| 	bpobj_t dle_bpobj; | ||||
| } dsl_deadlist_entry_t; | ||||
| 
 | ||||
| typedef struct livelist_condense_entry { | ||||
| 	struct dsl_dataset *ds; | ||||
| 	dsl_deadlist_entry_t *first; | ||||
| 	dsl_deadlist_entry_t *next; | ||||
| 	boolean_t syncing; | ||||
| 	boolean_t cancelled; | ||||
| } livelist_condense_entry_t; | ||||
| 
 | ||||
| extern unsigned long zfs_livelist_max_entries; | ||||
| extern int zfs_livelist_min_percent_shared; | ||||
| 
 | ||||
| typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle); | ||||
| 
 | ||||
| void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); | ||||
| void dsl_deadlist_close(dsl_deadlist_t *dl); | ||||
| void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg); | ||||
| uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); | ||||
| void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); | ||||
| void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); | ||||
| void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, | ||||
|     boolean_t free, dmu_tx_t *tx); | ||||
| int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); | ||||
| int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); | ||||
| void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); | ||||
| void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); | ||||
| void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, | ||||
| dmu_tx_t *tx); | ||||
| dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl); | ||||
| dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl); | ||||
| uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, | ||||
|     uint64_t mrs_obj, dmu_tx_t *tx); | ||||
| void dsl_deadlist_space(dsl_deadlist_t *dl, | ||||
| @ -81,6 +104,10 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); | ||||
| void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, | ||||
|     dmu_tx_t *tx); | ||||
| boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); | ||||
| int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free, | ||||
|     zthr_t *t, uint64_t *size); | ||||
| void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, | ||||
|     dmu_tx_t *tx); | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| } | ||||
|  | ||||
| @ -33,6 +33,7 @@ extern "C" { | ||||
| 
 | ||||
| struct nvlist; | ||||
| struct dsl_dataset; | ||||
| struct dsl_pool; | ||||
| struct dmu_tx; | ||||
| 
 | ||||
| int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t, | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2018 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2014, Joyent, Inc. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
|  */ | ||||
| @ -29,18 +29,20 @@ | ||||
| #define	_SYS_DSL_DIR_H | ||||
| 
 | ||||
| #include <sys/dmu.h> | ||||
| #include <sys/dsl_deadlist.h> | ||||
| #include <sys/dsl_pool.h> | ||||
| #include <sys/dsl_synctask.h> | ||||
| #include <sys/refcount.h> | ||||
| #include <sys/zfs_context.h> | ||||
| #include <sys/dsl_crypt.h> | ||||
| #include <sys/bplist.h> | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| extern "C" { | ||||
| #endif | ||||
| 
 | ||||
| struct dsl_dataset; | ||||
| 
 | ||||
| struct zthr; | ||||
| /*
 | ||||
|  * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. | ||||
|  * They should be of the format <reverse-dns>:<field>. | ||||
| @ -49,6 +51,7 @@ struct dsl_dataset; | ||||
| #define	DD_FIELD_FILESYSTEM_COUNT	"com.joyent:filesystem_count" | ||||
| #define	DD_FIELD_SNAPSHOT_COUNT		"com.joyent:snapshot_count" | ||||
| #define	DD_FIELD_CRYPTO_KEY_OBJ		"com.datto:crypto_key_obj" | ||||
| #define	DD_FIELD_LIVELIST		"com.delphix:livelist" | ||||
| 
 | ||||
| typedef enum dd_used { | ||||
| 	DD_USED_HEAD, | ||||
| @ -114,6 +117,10 @@ struct dsl_dir { | ||||
| 	/* amount of space we expect to write; == amount of dirty data */ | ||||
| 	int64_t dd_space_towrite[TXG_SIZE]; | ||||
| 
 | ||||
| 	dsl_deadlist_t dd_livelist; | ||||
| 	bplist_t dd_pending_frees; | ||||
| 	bplist_t dd_pending_allocs; | ||||
| 
 | ||||
| 	/* protected by dd_lock; keep at end of struct for better locality */ | ||||
| 	char dd_myname[ZFS_MAX_DATASET_NAME_LEN]; | ||||
| }; | ||||
| @ -182,6 +189,9 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, | ||||
|     dmu_tx_t *tx); | ||||
| void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); | ||||
| boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); | ||||
| void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj); | ||||
| void dsl_dir_livelist_close(dsl_dir_t *dd); | ||||
| void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total); | ||||
| 
 | ||||
| /* internal reserved dir name */ | ||||
| #define	MOS_DIR_NAME "$MOS" | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2013, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2013, 2018 by Delphix. All rights reserved. | ||||
|  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| @ -54,6 +54,7 @@ struct dsl_pool; | ||||
| struct dmu_tx; | ||||
| struct dsl_scan; | ||||
| struct dsl_crypto_params; | ||||
| struct dsl_deadlist; | ||||
| 
 | ||||
| extern unsigned long zfs_dirty_data_max; | ||||
| extern unsigned long zfs_dirty_data_max_max; | ||||
|  | ||||
| @ -63,6 +63,8 @@ typedef struct ddt ddt_t; | ||||
| typedef struct ddt_entry ddt_entry_t; | ||||
| typedef struct zbookmark_phys zbookmark_phys_t; | ||||
| 
 | ||||
| struct bpobj; | ||||
| struct bplist; | ||||
| struct dsl_pool; | ||||
| struct dsl_dataset; | ||||
| struct dsl_crypto_params; | ||||
| @ -532,6 +534,9 @@ _NOTE(CONSTCOND) } while (0) | ||||
| #define	BP_GET_BYTEORDER(bp)		BF64_GET((bp)->blk_prop, 63, 1) | ||||
| #define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x) | ||||
| 
 | ||||
| #define	BP_GET_FREE(bp)			BF64_GET((bp)->blk_fill, 0, 1) | ||||
| #define	BP_SET_FREE(bp, x)		BF64_SET((bp)->blk_fill, 0, 1, x) | ||||
| 
 | ||||
| #define	BP_PHYSICAL_BIRTH(bp)		\ | ||||
| 	(BP_IS_EMBEDDED(bp) ? 0 : \ | ||||
| 	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) | ||||
| @ -654,6 +659,7 @@ _NOTE(CONSTCOND) } while (0) | ||||
|  * 'func' is either snprintf() or mdb_snprintf(). | ||||
|  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. | ||||
|  */ | ||||
| 
 | ||||
| #define	SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ | ||||
| {									\ | ||||
| 	static const char *copyname[] =					\ | ||||
| @ -804,6 +810,8 @@ extern spa_t *spa_inject_addref(char *pool); | ||||
| extern void spa_inject_delref(spa_t *spa); | ||||
| extern void spa_scan_stat_init(spa_t *spa); | ||||
| extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); | ||||
| extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); | ||||
| extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); | ||||
| 
 | ||||
| #define	SPA_ASYNC_CONFIG_UPDATE			0x01 | ||||
| #define	SPA_ASYNC_REMOVE			0x02 | ||||
| @ -1131,6 +1139,7 @@ extern uint64_t spa_total_metaslabs(spa_t *spa); | ||||
| extern boolean_t spa_multihost(spa_t *spa); | ||||
| extern unsigned long spa_get_hostid(void); | ||||
| extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); | ||||
| extern boolean_t spa_livelist_delete_check(spa_t *spa); | ||||
| 
 | ||||
| extern int spa_mode(spa_t *spa); | ||||
| extern uint64_t zfs_strtonum(const char *str, char **nptr); | ||||
|  | ||||
| @ -49,6 +49,7 @@ | ||||
| #include <sys/dsl_crypt.h> | ||||
| #include <sys/zfeature.h> | ||||
| #include <sys/zthr.h> | ||||
| #include <sys/dsl_deadlist.h> | ||||
| #include <zfeature_common.h> | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| @ -317,6 +318,11 @@ struct spa { | ||||
| 	list_t		spa_log_summary; | ||||
| 	uint64_t	spa_log_flushall_txg; | ||||
| 
 | ||||
| 	zthr_t		*spa_livelist_delete_zthr; /* deleting livelists */ | ||||
| 	zthr_t		*spa_livelist_condense_zthr; /* condensing livelists */ | ||||
| 	uint64_t	spa_livelists_to_delete; /* set of livelists to free */ | ||||
| 	livelist_condense_entry_t	spa_to_condense; /* next to condense */ | ||||
| 
 | ||||
| 	char		*spa_root;		/* alternate root directory */ | ||||
| 	uint64_t	spa_ena;		/* spa-wide ereport ENA */ | ||||
| 	int		spa_last_open_failed;	/* error if last open failed */ | ||||
|  | ||||
| @ -33,7 +33,9 @@ extern void zthr_destroy(zthr_t *t); | ||||
| extern void zthr_wakeup(zthr_t *t); | ||||
| extern void zthr_cancel(zthr_t *t); | ||||
| extern void zthr_resume(zthr_t *t); | ||||
| extern void zthr_wait_cycle_done(zthr_t *t); | ||||
| 
 | ||||
| extern boolean_t zthr_iscancelled(zthr_t *t); | ||||
| extern boolean_t zthr_has_waiters(zthr_t *t); | ||||
| 
 | ||||
| #endif /* _SYS_ZTHR_H */ | ||||
|  | ||||
| @ -71,6 +71,7 @@ typedef enum spa_feature { | ||||
| 	SPA_FEATURE_REDACTED_DATASETS, | ||||
| 	SPA_FEATURE_BOOKMARK_WRITTEN, | ||||
| 	SPA_FEATURE_LOG_SPACEMAP, | ||||
| 	SPA_FEATURE_LIVELIST, | ||||
| 	SPA_FEATURES | ||||
| } spa_feature_t; | ||||
| 
 | ||||
|  | ||||
| @ -1909,6 +1909,98 @@ Pattern written to vdev free space by \fBzpool initialize\fR. | ||||
| Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee). | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_livelist_max_entries\fR (ulong) | ||||
| .ad | ||||
| .RS 12n | ||||
| The threshold size (in block pointers) at which we create a new sub-livelist. | ||||
| Larger sublists are more costly from a memory perspective but the fewer | ||||
| sublists there are, the lower the cost of insertion. | ||||
| .sp | ||||
| Default value: \fB500,000\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_livelist_min_percent_shared\fR (int) | ||||
| .ad | ||||
| .RS 12n | ||||
| If the amount of shared space between a snapshot and its clone drops below | ||||
| this threshold, the clone turns off the livelist and reverts to the old deletion | ||||
| method. This is in place because once a clone has been overwritten enough, | ||||
| livelists no longer give us a benefit. | ||||
| .sp | ||||
| Default value: \fB75\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_livelist_condense_new_alloc\fR (int) | ||||
| .ad | ||||
| .RS 12n | ||||
| Incremented each time an extra ALLOC blkptr is added to a livelist entry while | ||||
| it is being condensed. | ||||
| This option is used by the test suite to track race conditions. | ||||
| .sp | ||||
| Default value: \fB0\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_livelist_condense_sync_cancel\fR (int) | ||||
| .ad | ||||
| .RS 12n | ||||
| Incremented each time livelist condensing is canceled while in | ||||
| spa_livelist_condense_sync. | ||||
| This option is used by the test suite to track race conditions. | ||||
| .sp | ||||
| Default value: \fB0\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_livelist_condense_sync_pause\fR (int) | ||||
| .ad | ||||
| .RS 12n | ||||
| When set, the livelist condense process pauses indefinitely before | ||||
| executing the synctask - spa_livelist_condense_sync. | ||||
| This option is used by the test suite to trigger race conditions. | ||||
| .sp | ||||
| Default value: \fB0\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_livelist_condense_zthr_cancel\fR (int) | ||||
| .ad | ||||
| .RS 12n | ||||
| Incremented each time livelist condensing is canceled while in | ||||
| spa_livelist_condense_cb. | ||||
| This option is used by the test suite to track race conditions. | ||||
| .sp | ||||
| Default value: \fB0\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_livelist_condense_zthr_pause\fR (int) | ||||
| .ad | ||||
| .RS 12n | ||||
| When set, the livelist condense process pauses indefinitely before | ||||
| executing the open context condensing work in spa_livelist_condense_cb. | ||||
| This option is used by the test suite to trigger race conditions. | ||||
| .sp | ||||
| Default value: \fB0\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
|  | ||||
| @ -547,6 +547,26 @@ allow more data to be stored in the bonus buffer, thus potentially | ||||
| improving performance by avoiding the use of spill blocks. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fB\fBlivelist\fR\fR | ||||
| .ad | ||||
| .RS 4n | ||||
| .TS | ||||
| l l . | ||||
| GUID	com.delphix:livelist | ||||
| READ\-ONLY COMPATIBLE	yes | ||||
| DEPENDENCIES	extensible_dataset | ||||
| .TE | ||||
| This feature allows clones to be deleted faster than the traditional method | ||||
| when a large number of random/sparse writes have been made to the clone. | ||||
| All blocks allocated and freed after a clone is created are tracked by the | ||||
| clone's livelist, which is referenced during the deletion of the clone. | ||||
| The feature is activated when a clone is created and remains active until all | ||||
| clones have been destroyed. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| @ -882,7 +902,6 @@ This feature becomes \fBactive\fR when the \fBzpool checkpoint\fR subcommand | ||||
| is used to checkpoint the pool. | ||||
| The feature will only return back to being \fBenabled\fR when the pool | ||||
| is rewound or the checkpoint has been discarded. | ||||
| .RE | ||||
| 
 | ||||
| .SH "SEE ALSO" | ||||
| zpool(8) | ||||
|  | ||||
| @ -348,6 +348,18 @@ zpool_feature_init(void) | ||||
| 	    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, | ||||
| 	    ZFEATURE_TYPE_BOOLEAN, NULL); | ||||
| 
 | ||||
| 	{ | ||||
| 	static const spa_feature_t livelist_deps[] = { | ||||
| 		SPA_FEATURE_EXTENSIBLE_DATASET, | ||||
| 		SPA_FEATURE_NONE | ||||
| 	}; | ||||
| 	zfeature_register(SPA_FEATURE_LIVELIST, | ||||
| 	    "com.delphix:livelist", "livelist", | ||||
| 	    "Improved clone deletion performance.", | ||||
| 	    ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, | ||||
| 	    livelist_deps); | ||||
| 	} | ||||
| 
 | ||||
| 	{ | ||||
| 	static const spa_feature_t log_spacemap_deps[] = { | ||||
| 		SPA_FEATURE_SPACEMAP_V2, | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2018 by Delphix. All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| #include <sys/bplist.h> | ||||
| @ -75,3 +75,17 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) | ||||
| 	} | ||||
| 	mutex_exit(&bpl->bpl_lock); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| bplist_clear(bplist_t *bpl) | ||||
| { | ||||
| 	bplist_entry_t *bpe; | ||||
| 
 | ||||
| 	mutex_enter(&bpl->bpl_lock); | ||||
| 	while ((bpe = list_head(&bpl->bpl_list))) { | ||||
| 		bplist_iterate_last_removed = bpe; | ||||
| 		list_remove(&bpl->bpl_list, bpe); | ||||
| 		kmem_free(bpe, sizeof (*bpe)); | ||||
| 	} | ||||
| 	mutex_exit(&bpl->bpl_lock); | ||||
| } | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2011, 2016 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2011, 2018 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2017 Datto Inc. | ||||
|  */ | ||||
| 
 | ||||
| @ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) | ||||
| 		size = BPOBJ_SIZE_V0; | ||||
| 	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) | ||||
| 		size = BPOBJ_SIZE_V1; | ||||
| 	else if (!spa_feature_is_active(dmu_objset_spa(os), | ||||
| 	    SPA_FEATURE_LIVELIST)) | ||||
| 		size = BPOBJ_SIZE_V2; | ||||
| 	else | ||||
| 		size = sizeof (bpobj_phys_t); | ||||
| 
 | ||||
| @ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) | ||||
| 	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; | ||||
| 	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); | ||||
| 	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); | ||||
| 	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2); | ||||
| 	bpo->bpo_phys = bpo->bpo_dbuf->db_data; | ||||
| 	return (0); | ||||
| } | ||||
| @ -245,8 +249,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index) | ||||
|  * Update bpobj and all of its parents with new space accounting. | ||||
|  */ | ||||
| static void | ||||
| propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed, | ||||
|     uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx) | ||||
| propagate_space_reduction(bpobj_info_t *bpi, int64_t freed, | ||||
|     int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx) | ||||
| { | ||||
| 
 | ||||
| 	for (; bpi != NULL; bpi = bpi->bpi_parent) { | ||||
| @ -263,22 +267,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed, | ||||
| 
 | ||||
| static int | ||||
| bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, | ||||
|     dmu_tx_t *tx, boolean_t free) | ||||
|     int64_t start, dmu_tx_t *tx, boolean_t free) | ||||
| { | ||||
| 	int err = 0; | ||||
| 	uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0; | ||||
| 	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0; | ||||
| 	dmu_buf_t *dbuf = NULL; | ||||
| 	bpobj_t *bpo = bpi->bpi_bpo; | ||||
| 
 | ||||
| 	for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { | ||||
| 	for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { | ||||
| 		uint64_t offset = i * sizeof (blkptr_t); | ||||
| 		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); | ||||
| 
 | ||||
| 		if (dbuf == NULL || dbuf->db_offset > offset) { | ||||
| 			if (dbuf) | ||||
| 				dmu_buf_rele(dbuf, FTAG); | ||||
| 			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, | ||||
| 			    FTAG, &dbuf, 0); | ||||
| 			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, | ||||
| 			    offset, FTAG, &dbuf, 0); | ||||
| 			if (err) | ||||
| 				break; | ||||
| 		} | ||||
| @ -288,18 +292,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, | ||||
| 
 | ||||
| 		blkptr_t *bparray = dbuf->db_data; | ||||
| 		blkptr_t *bp = &bparray[blkoff]; | ||||
| 		err = func(arg, bp, tx); | ||||
| 
 | ||||
| 		boolean_t bp_freed = BP_GET_FREE(bp); | ||||
| 		err = func(arg, bp, bp_freed, tx); | ||||
| 		if (err) | ||||
| 			break; | ||||
| 
 | ||||
| 		if (free) { | ||||
| 			int sign = bp_freed ? -1 : +1; | ||||
| 			spa_t *spa = dmu_objset_spa(bpo->bpo_os); | ||||
| 			freed += bp_get_dsize_sync(spa, bp); | ||||
| 			comp_freed += BP_GET_PSIZE(bp); | ||||
| 			uncomp_freed += BP_GET_UCSIZE(bp); | ||||
| 			freed += sign * bp_get_dsize_sync(spa, bp); | ||||
| 			comp_freed += sign * BP_GET_PSIZE(bp); | ||||
| 			uncomp_freed += sign * BP_GET_UCSIZE(bp); | ||||
| 			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); | ||||
| 			bpo->bpo_phys->bpo_num_blkptrs--; | ||||
| 			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); | ||||
| 			if (bp_freed) { | ||||
| 				ASSERT(bpo->bpo_havefreed); | ||||
| 				bpo->bpo_phys->bpo_num_freed--; | ||||
| 				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	if (free) { | ||||
| @ -328,7 +340,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, | ||||
|  */ | ||||
| static int | ||||
| bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, | ||||
|     dmu_tx_t *tx, boolean_t free) | ||||
|     dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size) | ||||
| { | ||||
| 	list_t stack; | ||||
| 	bpobj_info_t *bpi; | ||||
| @ -341,6 +353,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, | ||||
| 	list_create(&stack, sizeof (bpobj_info_t), | ||||
| 	    offsetof(bpobj_info_t, bpi_node)); | ||||
| 	mutex_enter(&initial_bpo->bpo_lock); | ||||
| 
 | ||||
| 	if (bpobj_size != NULL) | ||||
| 		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs; | ||||
| 
 | ||||
| 	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); | ||||
| 
 | ||||
| 	while ((bpi = list_head(&stack)) != NULL) { | ||||
| @ -354,7 +370,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, | ||||
| 			dmu_buf_will_dirty(bpo->bpo_dbuf, tx); | ||||
| 
 | ||||
| 		if (bpi->bpi_visited == B_FALSE) { | ||||
| 			err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free); | ||||
| 			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx, | ||||
| 			    free); | ||||
| 			bpi->bpi_visited = B_TRUE; | ||||
| 			if (err != 0) | ||||
| 				break; | ||||
| @ -433,6 +450,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, | ||||
| 			 * We have unprocessed subobjs. Process the next one. | ||||
| 			 */ | ||||
| 			ASSERT(bpo->bpo_havecomp); | ||||
| 			ASSERT3P(bpobj_size, ==, NULL); | ||||
| 
 | ||||
| 			/* Add the last subobj to stack. */ | ||||
| 			int64_t i = bpi->bpi_unprocessed_subobjs - 1; | ||||
| @ -489,16 +507,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, | ||||
| int | ||||
| bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) | ||||
| { | ||||
| 	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); | ||||
| 	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Iterate the entries.  If func returns nonzero, iteration will stop. | ||||
|  * | ||||
|  * If there are no subobjs: | ||||
|  * | ||||
|  * *bpobj_size can be used to return the number of block pointers in the | ||||
|  * bpobj.  Note that this may be different from the number of block pointers | ||||
|  * that are iterated over, if iteration is terminated early (e.g. by the func | ||||
|  * returning nonzero). | ||||
|  * | ||||
|  * If there are concurrent (or subsequent) modifications to the bpobj then the | ||||
|  * returned *bpobj_size can be passed as "start" to | ||||
|  * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries. | ||||
|  */ | ||||
| int | ||||
| bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) | ||||
| bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, | ||||
|     uint64_t *bpobj_size) | ||||
| { | ||||
| 	return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); | ||||
| 	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Iterate over the blkptrs in the bpobj beginning at index start. If func | ||||
|  * returns nonzero, iteration will stop. This is a livelist specific function | ||||
|  * since it assumes that there are no subobjs present. | ||||
|  */ | ||||
| int | ||||
| livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, | ||||
|     int64_t start) | ||||
| { | ||||
| 	if (bpo->bpo_havesubobj) | ||||
| 		VERIFY0(bpo->bpo_phys->bpo_subobjs); | ||||
| 	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0); | ||||
| 	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE); | ||||
| 	kmem_free(bpi, sizeof (bpobj_info_t)); | ||||
| 	return (err); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
| @ -724,7 +771,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) | ||||
| } | ||||
| 
 | ||||
| void | ||||
| bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	blkptr_t stored_bp = *bp; | ||||
| 	uint64_t offset; | ||||
| @ -755,8 +803,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| 		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); | ||||
| 	} | ||||
| 
 | ||||
| 	/* We never need the fill count. */ | ||||
| 	stored_bp.blk_fill = 0; | ||||
| 	BP_SET_FREE(&stored_bp, bp_freed); | ||||
| 
 | ||||
| 	mutex_enter(&bpo->bpo_lock); | ||||
| 
 | ||||
| @ -779,11 +827,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| 
 | ||||
| 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx); | ||||
| 	bpo->bpo_phys->bpo_num_blkptrs++; | ||||
| 	bpo->bpo_phys->bpo_bytes += | ||||
| 	int sign = bp_freed ? -1 : +1; | ||||
| 	bpo->bpo_phys->bpo_bytes += sign * | ||||
| 	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); | ||||
| 	if (bpo->bpo_havecomp) { | ||||
| 		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); | ||||
| 		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); | ||||
| 		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp); | ||||
| 		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp); | ||||
| 	} | ||||
| 	if (bp_freed) { | ||||
| 		ASSERT(bpo->bpo_havefreed); | ||||
| 		bpo->bpo_phys->bpo_num_freed++; | ||||
| 	} | ||||
| 	mutex_exit(&bpo->bpo_lock); | ||||
| } | ||||
| @ -799,7 +852,7 @@ struct space_range_arg { | ||||
| 
 | ||||
| /* ARGSUSED */ | ||||
| static int | ||||
| space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) | ||||
| { | ||||
| 	struct space_range_arg *sra = arg; | ||||
| 
 | ||||
| @ -863,3 +916,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, | ||||
| 	*uncompp = sra.uncomp; | ||||
| 	return (err); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a | ||||
|  * bpobj are designated as free or allocated that information is not preserved | ||||
|  * in bplists. | ||||
|  */ | ||||
| /* ARGSUSED */ | ||||
| int | ||||
| bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	bplist_t *bpl = arg; | ||||
| 	bplist_append(bpl, bp); | ||||
| 	return (0); | ||||
| } | ||||
|  | ||||
| @ -3286,6 +3286,13 @@ dbuf_hold_impl_arg(struct dbuf_hold_arg *dh) | ||||
| 
 | ||||
| 	*(dh->dh_dbp) = NULL; | ||||
| 
 | ||||
| 	/* If the pool has been created, verify the tx_sync_lock is not held */ | ||||
| 	spa_t *spa = dh->dh_dn->dn_objset->os_spa; | ||||
| 	dsl_pool_t *dp = spa->spa_dsl_pool; | ||||
| 	if (dp != NULL) { | ||||
| 		ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); | ||||
| 	} | ||||
| 
 | ||||
| 	/* dbuf_find() returns with db_mtx held */ | ||||
| 	dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, | ||||
| 	    dh->dh_level, dh->dh_blkid); | ||||
| @ -4479,6 +4486,29 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) | ||||
| 	drica.drica_tx = tx; | ||||
| 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, | ||||
| 	    &drica)) { | ||||
| 		/*
 | ||||
| 		 * If the blkptr being remapped is tracked by a livelist, | ||||
| 		 * then we need to make sure the livelist reflects the update. | ||||
| 		 * First, cancel out the old blkptr by appending a 'FREE' | ||||
| 		 * entry. Next, add an 'ALLOC' to track the new version. This | ||||
| 		 * way we avoid trying to free an inaccurate blkptr at delete. | ||||
| 		 * Note that embedded blkptrs are not tracked in livelists. | ||||
| 		 */ | ||||
| 		if (dn->dn_objset != spa_meta_objset(spa)) { | ||||
| 			dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); | ||||
| 			if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && | ||||
| 			    bp->blk_birth > ds->ds_dir->dd_origin_txg) { | ||||
| 				ASSERT(!BP_IS_EMBEDDED(bp)); | ||||
| 				ASSERT(dsl_dir_is_clone(ds->ds_dir)); | ||||
| 				ASSERT(spa_feature_is_enabled(spa, | ||||
| 				    SPA_FEATURE_LIVELIST)); | ||||
| 				bplist_append(&ds->ds_dir->dd_pending_frees, | ||||
| 				    bp); | ||||
| 				bplist_append(&ds->ds_dir->dd_pending_allocs, | ||||
| 				    &bp_copy); | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * The db_rwlock prevents dbuf_read_impl() from | ||||
| 		 * dereferencing the BP while we are changing it.  To | ||||
|  | ||||
| @ -122,13 +122,12 @@ parent_delta(dsl_dataset_t *ds, int64_t delta) | ||||
| void | ||||
| dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| { | ||||
| 	int used, compressed, uncompressed; | ||||
| 	spa_t *spa = dmu_tx_pool(tx)->dp_spa; | ||||
| 	int used = bp_get_dsize_sync(spa, bp); | ||||
| 	int compressed = BP_GET_PSIZE(bp); | ||||
| 	int uncompressed = BP_GET_UCSIZE(bp); | ||||
| 	int64_t delta; | ||||
| 
 | ||||
| 	used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); | ||||
| 	compressed = BP_GET_PSIZE(bp); | ||||
| 	uncompressed = BP_GET_UCSIZE(bp); | ||||
| 
 | ||||
| 	dprintf_bp(bp, "ds=%p", ds); | ||||
| 
 | ||||
| 	ASSERT(dmu_tx_is_syncing(tx)); | ||||
| @ -164,6 +163,19 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| 		ds->ds_feature_activation[f] = (void *)B_TRUE; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Track block for livelist, but ignore embedded blocks because | ||||
| 	 * they do not need to be freed. | ||||
| 	 */ | ||||
| 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && | ||||
| 	    bp->blk_birth > ds->ds_dir->dd_origin_txg && | ||||
| 	    !(BP_IS_EMBEDDED(bp))) { | ||||
| 		ASSERT(dsl_dir_is_clone(ds->ds_dir)); | ||||
| 		ASSERT(spa_feature_is_enabled(spa, | ||||
| 		    SPA_FEATURE_LIVELIST)); | ||||
| 		bplist_append(&ds->ds_dir->dd_pending_allocs, bp); | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_exit(&ds->ds_lock); | ||||
| 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, | ||||
| 	    compressed, uncompressed, tx); | ||||
| @ -207,8 +219,8 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, | ||||
| 		DVA_SET_VDEV(dva, vdev); | ||||
| 		DVA_SET_OFFSET(dva, offset); | ||||
| 		DVA_SET_ASIZE(dva, size); | ||||
| 
 | ||||
| 		dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx); | ||||
| 		dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE, | ||||
| 		    tx); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| @ -239,6 +251,19 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, | ||||
| 	ASSERT(!ds->ds_is_snapshot); | ||||
| 	dmu_buf_will_dirty(ds->ds_dbuf, tx); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Track block for livelist, but ignore embedded blocks because | ||||
| 	 * they do not need to be freed. | ||||
| 	 */ | ||||
| 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && | ||||
| 	    bp->blk_birth > ds->ds_dir->dd_origin_txg && | ||||
| 	    !(BP_IS_EMBEDDED(bp))) { | ||||
| 		ASSERT(dsl_dir_is_clone(ds->ds_dir)); | ||||
| 		ASSERT(spa_feature_is_enabled(spa, | ||||
| 		    SPA_FEATURE_LIVELIST)); | ||||
| 		bplist_append(&ds->ds_dir->dd_pending_frees, bp); | ||||
| 	} | ||||
| 
 | ||||
| 	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { | ||||
| 		int64_t delta; | ||||
| 
 | ||||
| @ -267,7 +292,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, | ||||
| 			 */ | ||||
| 			bplist_append(&ds->ds_pending_deadlist, bp); | ||||
| 		} else { | ||||
| 			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); | ||||
| 			dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx); | ||||
| 		} | ||||
| 		ASSERT3U(ds->ds_prev->ds_object, ==, | ||||
| 		    dsl_dataset_phys(ds)->ds_prev_snap_obj); | ||||
| @ -1241,6 +1266,14 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, | ||||
| 
 | ||||
| 	ASSERT(dmu_tx_is_syncing(tx)); | ||||
| 	ASSERT(lastname[0] != '@'); | ||||
| 	/*
 | ||||
| 	 * Filesystems will eventually have their origin set to dp_origin_snap, | ||||
| 	 * but that's taken care of in dsl_dataset_create_sync_dd. When | ||||
| 	 * creating a filesystem, this function is called with origin equal to | ||||
| 	 * NULL. | ||||
| 	 */ | ||||
| 	if (origin != NULL) | ||||
| 		ASSERT3P(origin, !=, dp->dp_origin_snap); | ||||
| 
 | ||||
| 	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); | ||||
| 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); | ||||
| @ -1250,6 +1283,20 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, | ||||
| 
 | ||||
| 	dsl_deleg_set_create_perms(dd, tx, cr); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we are creating a clone and the livelist feature is enabled, | ||||
| 	 * add the entry DD_FIELD_LIVELIST to ZAP. | ||||
| 	 */ | ||||
| 	if (origin != NULL && | ||||
| 	    spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) { | ||||
| 		objset_t *mos = dd->dd_pool->dp_meta_objset; | ||||
| 		dsl_dir_zapify(dd, tx); | ||||
| 		uint64_t obj = dsl_deadlist_alloc(mos, tx); | ||||
| 		VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST, | ||||
| 		    sizeof (uint64_t), 1, &obj, tx)); | ||||
| 		spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Since we're creating a new node we know it's a leaf, so we can | ||||
| 	 * initialize the counts if the limit feature is active. | ||||
| @ -2036,12 +2083,149 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| /*
 | ||||
|  * Check if the percentage of blocks shared between the clone and the | ||||
|  * snapshot (as opposed to those that are clone only) is at or below the | ||||
|  * zfs_livelist_min_percent_shared threshold. | ||||
|  */ | ||||
| boolean_t | ||||
| dsl_livelist_should_disable(dsl_dataset_t *ds) | ||||
| { | ||||
| 	dsl_deadlist_t *dl = arg; | ||||
| 	dsl_deadlist_insert(dl, bp, tx); | ||||
| 	uint64_t used, referenced; | ||||
| 	int percent_shared; | ||||
| 
 | ||||
| 	used = dsl_dir_get_usedds(ds->ds_dir); | ||||
| 	referenced = dsl_get_referenced(ds); | ||||
| 	ASSERT3U(referenced, >=, 0); | ||||
| 	ASSERT3U(used, >=, 0); | ||||
| 	if (referenced == 0) | ||||
| 		return (B_FALSE); | ||||
| 	percent_shared = (100 * (referenced - used)) / referenced; | ||||
| 	if (percent_shared <= zfs_livelist_min_percent_shared) | ||||
| 		return (B_TRUE); | ||||
| 	return (B_FALSE); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  *  Check if it is possible to combine two livelist entries into one. | ||||
|  *  This is the case if the combined number of 'live' blkptrs (ALLOCs that | ||||
|  *  don't have a matching FREE) is under the maximum sublist size. | ||||
|  *  We check this by subtracting twice the total number of frees from the total | ||||
|  *  number of blkptrs. FREEs are counted twice because each FREE blkptr | ||||
|  *  will cancel out an ALLOC blkptr when the livelist is processed. | ||||
|  */ | ||||
| static boolean_t | ||||
| dsl_livelist_should_condense(dsl_deadlist_entry_t *first, | ||||
|     dsl_deadlist_entry_t *next) | ||||
| { | ||||
| 	uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed + | ||||
| 	    next->dle_bpobj.bpo_phys->bpo_num_freed; | ||||
| 	uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs + | ||||
| 	    next->dle_bpobj.bpo_phys->bpo_num_blkptrs; | ||||
| 	if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries) | ||||
| 		return (B_TRUE); | ||||
| 	return (B_FALSE); | ||||
| } | ||||
| 
 | ||||
| typedef struct try_condense_arg { | ||||
| 	spa_t *spa; | ||||
| 	dsl_dataset_t *ds; | ||||
| } try_condense_arg_t; | ||||
| 
 | ||||
| /*
 | ||||
|  * Iterate over the livelist entries, searching for a pair to condense. | ||||
|  * A nonzero return value means stop, 0 means keep looking. | ||||
|  */ | ||||
| static int | ||||
| dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first) | ||||
| { | ||||
| 	try_condense_arg_t *tca = arg; | ||||
| 	spa_t *spa = tca->spa; | ||||
| 	dsl_dataset_t *ds = tca->ds; | ||||
| 	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; | ||||
| 	dsl_deadlist_entry_t *next; | ||||
| 
 | ||||
| 	/* The condense thread has not yet been created at import */ | ||||
| 	if (spa->spa_livelist_condense_zthr == NULL) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	/* A condense is already in progress */ | ||||
| 	if (spa->spa_to_condense.ds != NULL) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	next = AVL_NEXT(&ll->dl_tree, &first->dle_node); | ||||
| 	/* The livelist has only one entry - don't condense it */ | ||||
| 	if (next == NULL) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	/* Next is the newest entry - don't condense it */ | ||||
| 	if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	/* This pair is not ready to condense but keep looking */ | ||||
| 	if (!dsl_livelist_should_condense(first, next)) | ||||
| 		return (0); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Add a ref to prevent the dataset from being evicted while | ||||
| 	 * the condense zthr or synctask are running. Ref will be | ||||
| 	 * released at the end of the condense synctask | ||||
| 	 */ | ||||
| 	dmu_buf_add_ref(ds->ds_dbuf, spa); | ||||
| 
 | ||||
| 	spa->spa_to_condense.ds = ds; | ||||
| 	spa->spa_to_condense.first = first; | ||||
| 	spa->spa_to_condense.next = next; | ||||
| 	spa->spa_to_condense.syncing = B_FALSE; | ||||
| 	spa->spa_to_condense.cancelled = B_FALSE; | ||||
| 
 | ||||
| 	zthr_wakeup(spa->spa_livelist_condense_zthr); | ||||
| 	return (1); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx) | ||||
| { | ||||
| 	dsl_dir_t *dd = ds->ds_dir; | ||||
| 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa; | ||||
| 	dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist); | ||||
| 
 | ||||
| 	/* Check if we need to add a new sub-livelist */ | ||||
| 	if (last == NULL) { | ||||
| 		/* The livelist is empty */ | ||||
| 		dsl_deadlist_add_key(&dd->dd_livelist, | ||||
| 		    tx->tx_txg - 1, tx); | ||||
| 	} else if (spa_sync_pass(spa) == 1) { | ||||
| 		/*
 | ||||
| 		 * Check if the newest entry is full. If it is, make a new one. | ||||
| 		 * We only do this once per sync because we could overfill a | ||||
| 		 * sublist in one sync pass and don't want to add another entry | ||||
| 		 * for a txg that is already represented. This ensures that | ||||
| 		 * blkptrs born in the same txg are stored in the same sublist. | ||||
| 		 */ | ||||
| 		bpobj_t bpobj = last->dle_bpobj; | ||||
| 		uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs; | ||||
| 		uint64_t free = bpobj.bpo_phys->bpo_num_freed; | ||||
| 		uint64_t alloc = all - free; | ||||
| 		if (alloc > zfs_livelist_max_entries) { | ||||
| 			dsl_deadlist_add_key(&dd->dd_livelist, | ||||
| 			    tx->tx_txg - 1, tx); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/* Insert each entry into the on-disk livelist */ | ||||
| 	bplist_iterate(&dd->dd_pending_allocs, | ||||
| 	    dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx); | ||||
| 	bplist_iterate(&dd->dd_pending_frees, | ||||
| 	    dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx); | ||||
| 
 | ||||
| 	/* Attempt to condense every pair of adjacent entries */ | ||||
| 	try_condense_arg_t arg = { | ||||
| 	    .spa = spa, | ||||
| 	    .ds = ds | ||||
| 	}; | ||||
| 	dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense, | ||||
| 	    &arg); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| @ -2050,7 +2234,14 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) | ||||
| 	objset_t *os = ds->ds_objset; | ||||
| 
 | ||||
| 	bplist_iterate(&ds->ds_pending_deadlist, | ||||
| 	    deadlist_enqueue_cb, &ds->ds_deadlist, tx); | ||||
| 	    dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx); | ||||
| 
 | ||||
| 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { | ||||
| 		dsl_flush_pending_livelist(ds, tx); | ||||
| 		if (dsl_livelist_should_disable(ds)) { | ||||
| 			dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	dsl_bookmark_sync_done(ds, tx); | ||||
| 
 | ||||
| @ -3335,6 +3526,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) | ||||
| 	uint64_t oldnext_obj; | ||||
| 	int64_t delta; | ||||
| 
 | ||||
| 	ASSERT(nvlist_empty(ddpa->err_ds)); | ||||
| 
 | ||||
| 	VERIFY0(promote_hold(ddpa, dp, FTAG)); | ||||
| 	hds = ddpa->ddpa_clone; | ||||
| 
 | ||||
| @ -3519,6 +3712,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) | ||||
| 
 | ||||
| 	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Since livelists are specific to a clone's origin txg, they | ||||
| 	 * are no longer accurate. Destroy the livelist from the clone being | ||||
| 	 * promoted. If the origin dataset is a clone, destroy its livelist | ||||
| 	 * as well. | ||||
| 	 */ | ||||
| 	dsl_dir_remove_livelist(dd, tx, B_TRUE); | ||||
| 	dsl_dir_remove_livelist(origin_ds->ds_dir, tx, B_TRUE); | ||||
| 
 | ||||
| 	/* log history record */ | ||||
| 	spa_history_log_internal_ds(hds, "promote", tx, ""); | ||||
| 
 | ||||
| @ -3990,6 +4192,14 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, | ||||
| 
 | ||||
| 	dsl_scan_ds_clone_swapped(origin_head, clone, tx); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Destroy any livelists associated with the clone or the origin, | ||||
| 	 * since after the swap the corresponding livelists are no longer | ||||
| 	 * valid. | ||||
| 	 */ | ||||
| 	dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE); | ||||
| 	dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE); | ||||
| 
 | ||||
| 	spa_history_log_internal_ds(clone, "clone swap", tx, | ||||
| 	    "parent=%s", origin_head->ds_dir->dd_myname); | ||||
| } | ||||
|  | ||||
| @ -20,16 +20,16 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012, 2018 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2019 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| #include <sys/dsl_dataset.h> | ||||
| #include <sys/dmu.h> | ||||
| #include <sys/refcount.h> | ||||
| #include <sys/zap.h> | ||||
| #include <sys/zfs_context.h> | ||||
| #include <sys/dsl_pool.h> | ||||
| #include <sys/dsl_dataset.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * Deadlist concurrency: | ||||
| @ -51,6 +51,68 @@ | ||||
|  * provides its own locking, and dl_oldfmt is immutable. | ||||
|  */ | ||||
| 
 | ||||
| /*
 | ||||
|  * Livelist Overview | ||||
|  * ================ | ||||
|  * | ||||
|  * Livelists use the same 'deadlist_t' struct as deadlists and are also used | ||||
|  * to track blkptrs over the lifetime of a dataset. Livelists however, belong | ||||
|  * to clones and track the blkptrs that are clone-specific (were born after | ||||
|  * the clone's creation). The exception is embedded block pointers which are | ||||
|  * not included in livelists because they do not need to be freed. | ||||
|  * | ||||
|  * When it comes time to delete the clone, the livelist provides a quick | ||||
|  * reference as to what needs to be freed. For this reason, livelists also track | ||||
|  * when clone-specific blkptrs are freed before deletion to prevent double | ||||
|  * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the | ||||
|  * deletion algorithm iterates backwards over the livelist, matching | ||||
|  * FREE/ALLOC pairs and then freeing those ALLOCs which remain. Livelists | ||||
|  * are also updated in the case when blkptrs are remapped: the old version | ||||
|  * of the blkptr is cancelled out with a FREE and the new version is tracked | ||||
|  * with an ALLOC. | ||||
|  * | ||||
|  * To bound the amount of memory required for deletion, livelists over a | ||||
|  * certain size are spread over multiple entries. Entries are grouped by | ||||
|  * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will | ||||
|  * be in the same entry. This allows us to delete livelists incrementally | ||||
|  * over multiple syncs, one entry at a time. | ||||
|  * | ||||
|  * During the lifetime of the clone, livelists can get extremely large. | ||||
|  * Their size is managed by periodic condensing (preemptively cancelling out | ||||
|  * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when | ||||
|  * the shared space between the clone and its origin is so small that it | ||||
|  * doesn't make sense to use livelists anymore. | ||||
|  */ | ||||
| 
 | ||||
| /*
 | ||||
|  * The threshold sublist size at which we create a new sub-livelist for the | ||||
|  * next txg. However, since blkptrs of the same transaction group must be in | ||||
|  * the same sub-list, the actual sublist size may exceed this. When picking the | ||||
|  * size we had to balance the fact that larger sublists mean fewer sublists | ||||
|  * (decreasing the cost of insertion) against the consideration that sublists | ||||
|  * will be loaded into memory and shouldn't take up an inordinate amount of | ||||
|  * space. We settled on ~500000 entries, corresponding to roughly 128M. | ||||
|  */ | ||||
| unsigned long zfs_livelist_max_entries = 500000; | ||||
| 
 | ||||
| /*
 | ||||
|  * We can approximate how much of a performance gain a livelist will give us | ||||
|  * based on the percentage of blocks shared between the clone and its origin. | ||||
|  * 0 percent shared means that the clone has completely diverged and that the | ||||
|  * old method is maximally effective: every read from the block tree will | ||||
|  * result in lots of frees. Livelists give us gains when they track blocks | ||||
|  * scattered across the tree, when one read in the old method might only | ||||
|  * result in a few frees. Once the clone has been overwritten enough, | ||||
|  * writes are no longer sparse and we'll no longer get much of a benefit from | ||||
|  * tracking them with a livelist. We chose a lower limit of 75 percent shared | ||||
|  * (25 percent overwritten). This means that 1/4 of all block pointers will be | ||||
|  * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists | ||||
|  * to make deletion 4x faster. Once the amount of shared space drops below this | ||||
|  * threshold, the clone will revert to the old deletion method. | ||||
|  */ | ||||
| int zfs_livelist_min_percent_shared = 75; | ||||
| 
 | ||||
| 
 | ||||
| static int | ||||
| dsl_deadlist_compare(const void *arg1, const void *arg2) | ||||
| { | ||||
| @ -88,6 +150,23 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) | ||||
| 	dl->dl_havetree = B_TRUE; | ||||
| } | ||||
| 
 | ||||
| void | ||||
| dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) | ||||
| { | ||||
| 	dsl_deadlist_entry_t *dle; | ||||
| 
 | ||||
| 	ASSERT(dsl_deadlist_is_open(dl)); | ||||
| 
 | ||||
| 	mutex_enter(&dl->dl_lock); | ||||
| 	dsl_deadlist_load_tree(dl); | ||||
| 	mutex_exit(&dl->dl_lock); | ||||
| 	for (dle = avl_first(&dl->dl_tree); dle != NULL; | ||||
| 	    dle = AVL_NEXT(&dl->dl_tree, dle)) { | ||||
| 		if (func(args, dle) != 0) | ||||
| 			break; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void | ||||
| dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) | ||||
| { | ||||
| @ -188,7 +267,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) | ||||
| 
 | ||||
| static void | ||||
| dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, | ||||
|     const blkptr_t *bp, dmu_tx_t *tx) | ||||
|     const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) | ||||
| { | ||||
| 	ASSERT(MUTEX_HELD(&dl->dl_lock)); | ||||
| 	if (dle->dle_bpobj.bpo_object == | ||||
| @ -200,7 +279,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, | ||||
| 		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, | ||||
| 		    dle->dle_mintxg, obj, tx)); | ||||
| 	} | ||||
| 	bpobj_enqueue(&dle->dle_bpobj, bp, tx); | ||||
| 	bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| @ -221,14 +300,15 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, | ||||
| } | ||||
| 
 | ||||
| void | ||||
| dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	dsl_deadlist_entry_t dle_tofind; | ||||
| 	dsl_deadlist_entry_t *dle; | ||||
| 	avl_index_t where; | ||||
| 
 | ||||
| 	if (dl->dl_oldfmt) { | ||||
| 		bpobj_enqueue(&dl->dl_bpobj, bp, tx); | ||||
| 		bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| @ -236,10 +316,12 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| 	dsl_deadlist_load_tree(dl); | ||||
| 
 | ||||
| 	dmu_buf_will_dirty(dl->dl_dbuf, tx); | ||||
| 
 | ||||
| 	int sign = bp_freed ? -1 : +1; | ||||
| 	dl->dl_phys->dl_used += | ||||
| 	    bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); | ||||
| 	dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); | ||||
| 	dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); | ||||
| 	    sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); | ||||
| 	dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); | ||||
| 	dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); | ||||
| 
 | ||||
| 	dle_tofind.dle_mintxg = bp->blk_birth; | ||||
| 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where); | ||||
| @ -255,10 +337,26 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| 	} | ||||
| 
 | ||||
| 	ASSERT3P(dle, !=, NULL); | ||||
| 	dle_enqueue(dl, dle, bp, tx); | ||||
| 	dle_enqueue(dl, dle, bp, bp_freed, tx); | ||||
| 	mutex_exit(&dl->dl_lock); | ||||
| } | ||||
| 
 | ||||
| int | ||||
| dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| { | ||||
| 	dsl_deadlist_t *dl = arg; | ||||
| 	dsl_deadlist_insert(dl, bp, B_FALSE, tx); | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| int | ||||
| dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| { | ||||
| 	dsl_deadlist_t *dl = arg; | ||||
| 	dsl_deadlist_insert(dl, bp, B_TRUE, tx); | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Insert new key in deadlist, which must be > all current entries. | ||||
|  * mintxg is not inclusive. | ||||
| @ -316,6 +414,108 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) | ||||
| 	mutex_exit(&dl->dl_lock); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Remove a deadlist entry and all of its contents by removing the entry from | ||||
|  * the deadlist's avl tree, freeing the entry's bpobj and adjusting the | ||||
|  * deadlist's space accounting accordingly. | ||||
|  */ | ||||
| void | ||||
| dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) | ||||
| { | ||||
| 	uint64_t used, comp, uncomp; | ||||
| 	dsl_deadlist_entry_t dle_tofind; | ||||
| 	dsl_deadlist_entry_t *dle; | ||||
| 	objset_t *os = dl->dl_os; | ||||
| 
 | ||||
| 	if (dl->dl_oldfmt) | ||||
| 		return; | ||||
| 
 | ||||
| 	mutex_enter(&dl->dl_lock); | ||||
| 	dsl_deadlist_load_tree(dl); | ||||
| 
 | ||||
| 	dle_tofind.dle_mintxg = mintxg; | ||||
| 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); | ||||
| 	VERIFY3P(dle, !=, NULL); | ||||
| 
 | ||||
| 	avl_remove(&dl->dl_tree, dle); | ||||
| 	VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); | ||||
| 	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); | ||||
| 	dl->dl_phys->dl_used -= used; | ||||
| 	dl->dl_phys->dl_comp -= comp; | ||||
| 	dl->dl_phys->dl_uncomp -= uncomp; | ||||
| 	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { | ||||
| 		bpobj_decr_empty(os, tx); | ||||
| 	} else { | ||||
| 		bpobj_free(os, dle->dle_bpobj.bpo_object, tx); | ||||
| 	} | ||||
| 	bpobj_close(&dle->dle_bpobj); | ||||
| 	kmem_free(dle, sizeof (*dle)); | ||||
| 	mutex_exit(&dl->dl_lock); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Clear out the contents of a deadlist_entry by freeing its bpobj, | ||||
|  * replacing it with an empty bpobj and adjusting the deadlist's | ||||
|  * space accounting | ||||
|  */ | ||||
| void | ||||
| dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	uint64_t new_obj, used, comp, uncomp; | ||||
| 	objset_t *os = dl->dl_os; | ||||
| 
 | ||||
| 	mutex_enter(&dl->dl_lock); | ||||
| 	VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); | ||||
| 	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); | ||||
| 	dl->dl_phys->dl_used -= used; | ||||
| 	dl->dl_phys->dl_comp -= comp; | ||||
| 	dl->dl_phys->dl_uncomp -= uncomp; | ||||
| 	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) | ||||
| 		bpobj_decr_empty(os, tx); | ||||
| 	else | ||||
| 		bpobj_free(os, dle->dle_bpobj.bpo_object, tx); | ||||
| 	bpobj_close(&dle->dle_bpobj); | ||||
| 	new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); | ||||
| 	VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); | ||||
| 	VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, | ||||
| 	    new_obj, tx)); | ||||
| 	ASSERT(bpobj_is_empty(&dle->dle_bpobj)); | ||||
| 	mutex_exit(&dl->dl_lock); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the first entry in deadlist's avl tree | ||||
|  */ | ||||
| dsl_deadlist_entry_t * | ||||
| dsl_deadlist_first(dsl_deadlist_t *dl) | ||||
| { | ||||
| 	dsl_deadlist_entry_t *dle; | ||||
| 
 | ||||
| 	mutex_enter(&dl->dl_lock); | ||||
| 	dsl_deadlist_load_tree(dl); | ||||
| 	dle = avl_first(&dl->dl_tree); | ||||
| 	mutex_exit(&dl->dl_lock); | ||||
| 
 | ||||
| 	return (dle); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the last entry in deadlist's avl tree | ||||
|  */ | ||||
| dsl_deadlist_entry_t * | ||||
| dsl_deadlist_last(dsl_deadlist_t *dl) | ||||
| { | ||||
| 	dsl_deadlist_entry_t *dle; | ||||
| 
 | ||||
| 	mutex_enter(&dl->dl_lock); | ||||
| 	dsl_deadlist_load_tree(dl); | ||||
| 	dle = avl_last(&dl->dl_tree); | ||||
| 	mutex_exit(&dl->dl_lock); | ||||
| 
 | ||||
| 	return (dle); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Walk ds's snapshots to regenerate the ZAP & AVL. | ||||
|  */ | ||||
| @ -478,10 +678,11 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	dsl_deadlist_t *dl = arg; | ||||
| 	dsl_deadlist_insert(dl, bp, tx); | ||||
| 	dsl_deadlist_insert(dl, bp, bp_freed, tx); | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| @ -572,3 +773,109 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, | ||||
| 	} | ||||
| 	mutex_exit(&dl->dl_lock); | ||||
| } | ||||
| 
 | ||||
| typedef struct livelist_entry { | ||||
| 	const blkptr_t *le_bp; | ||||
| 	avl_node_t le_node; | ||||
| } livelist_entry_t; | ||||
| 
 | ||||
| static int | ||||
| livelist_compare(const void *larg, const void *rarg) | ||||
| { | ||||
| 	const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp; | ||||
| 	const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp; | ||||
| 
 | ||||
| 	/* Sort them according to dva[0] */ | ||||
| 	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); | ||||
| 	uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); | ||||
| 
 | ||||
| 	if (l_dva0_vdev != r_dva0_vdev) | ||||
| 		return (AVL_CMP(l_dva0_vdev, r_dva0_vdev)); | ||||
| 
 | ||||
| 	/* if vdevs are equal, sort by offsets. */ | ||||
| 	uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); | ||||
| 	uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); | ||||
| 	if (l_dva0_offset == r_dva0_offset) | ||||
| 		ASSERT3U(l->blk_birth, ==, r->blk_birth); | ||||
| 	return (AVL_CMP(l_dva0_offset, r_dva0_offset)); | ||||
| } | ||||
| 
 | ||||
| struct livelist_iter_arg { | ||||
| 	avl_tree_t *avl; | ||||
| 	bplist_t *to_free; | ||||
| 	zthr_t *t; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * Expects an AVL tree which is incrementally filled will FREE blkptrs | ||||
|  * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a | ||||
|  * corresponding FREE are stored in the supplied bplist. | ||||
|  */ | ||||
| static int | ||||
| dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	struct livelist_iter_arg *lia = arg; | ||||
| 	avl_tree_t *avl = lia->avl; | ||||
| 	bplist_t *to_free = lia->to_free; | ||||
| 	zthr_t *t = lia->t; | ||||
| 	ASSERT(tx == NULL); | ||||
| 
 | ||||
| 	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) | ||||
| 		return (SET_ERROR(EINTR)); | ||||
| 	if (bp_freed) { | ||||
| 		livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t), | ||||
| 		    KM_SLEEP); | ||||
| 		blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); | ||||
| 		*temp_bp = *bp; | ||||
| 		node->le_bp = temp_bp; | ||||
| 		avl_add(avl, node); | ||||
| 	} else { | ||||
| 		livelist_entry_t node; | ||||
| 		node.le_bp = bp; | ||||
| 		livelist_entry_t *found = avl_find(avl, &node, NULL); | ||||
| 		if (found != NULL) { | ||||
| 			avl_remove(avl, found); | ||||
| 			kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t)); | ||||
| 			kmem_free(found, sizeof (livelist_entry_t)); | ||||
| 		} else { | ||||
| 			bplist_append(to_free, bp); | ||||
| 		} | ||||
| 	} | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs | ||||
|  * which have an ALLOC entry but no matching FREE | ||||
|  */ | ||||
| int | ||||
| dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, | ||||
|     uint64_t *size) | ||||
| { | ||||
| 	avl_tree_t avl; | ||||
| 	avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), | ||||
| 	    offsetof(livelist_entry_t, le_node)); | ||||
| 
 | ||||
| 	/* process the sublist */ | ||||
| 	struct livelist_iter_arg arg = { | ||||
| 	    .avl = &avl, | ||||
| 	    .to_free = to_free, | ||||
| 	    .t = t | ||||
| 	}; | ||||
| 	int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); | ||||
| 
 | ||||
| 	avl_destroy(&avl); | ||||
| 	return (err); | ||||
| } | ||||
| 
 | ||||
| #if defined(_KERNEL) | ||||
| /* CSTYLED */ | ||||
| module_param(zfs_livelist_max_entries, ulong, 0644); | ||||
| MODULE_PARM_DESC(zfs_livelist_max_entries, | ||||
| 	"Size to start the next sub-livelist in a livelist"); | ||||
| 
 | ||||
| module_param(zfs_livelist_min_percent_shared, int, 0644); | ||||
| MODULE_PARM_DESC(zfs_livelist_min_percent_shared, | ||||
| 	"Threshold at which livelist is disabled"); | ||||
| #endif | ||||
|  | ||||
| @ -45,6 +45,9 @@ | ||||
| #include <sys/dmu_impl.h> | ||||
| #include <sys/zvol.h> | ||||
| #include <sys/zcp.h> | ||||
| #include <sys/dsl_deadlist.h> | ||||
| #include <sys/zthr.h> | ||||
| #include <sys/spa_impl.h> | ||||
| 
 | ||||
| int | ||||
| dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) | ||||
| @ -120,7 +123,7 @@ struct process_old_arg { | ||||
| }; | ||||
| 
 | ||||
| static int | ||||
| process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) | ||||
| { | ||||
| 	struct process_old_arg *poa = arg; | ||||
| 	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; | ||||
| @ -128,7 +131,7 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| 	ASSERT(!BP_IS_HOLE(bp)); | ||||
| 
 | ||||
| 	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { | ||||
| 		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); | ||||
| 		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx); | ||||
| 		if (poa->ds_prev && !poa->after_branch_point && | ||||
| 		    bp->blk_birth > | ||||
| 		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { | ||||
| @ -852,6 +855,127 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) | ||||
| 	dmu_object_free_zapified(mos, ddobj, tx); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| dsl_clone_destroy_assert(dsl_dir_t *dd) | ||||
| { | ||||
| 	uint64_t used, comp, uncomp; | ||||
| 
 | ||||
| 	ASSERT(dsl_dir_is_clone(dd)); | ||||
| 	dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); | ||||
| 
 | ||||
| 	ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used); | ||||
| 	ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp); | ||||
| 	/*
 | ||||
| 	 * Greater than because we do not track embedded block pointers in | ||||
| 	 * the livelist | ||||
| 	 */ | ||||
| 	ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp); | ||||
| 
 | ||||
| 	ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list)); | ||||
| 	ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Start the delete process for a clone. Free its zil, verify the space usage | ||||
|  * and queue the blkptrs for deletion by adding the livelist to the pool-wide | ||||
|  * delete queue. | ||||
|  */ | ||||
| static void | ||||
| dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) | ||||
| { | ||||
| 	uint64_t zap_obj, to_delete, used, comp, uncomp; | ||||
| 	objset_t *os; | ||||
| 	dsl_dir_t *dd = ds->ds_dir; | ||||
| 	dsl_pool_t *dp = dmu_tx_pool(tx); | ||||
| 	objset_t *mos = dp->dp_meta_objset; | ||||
| 	spa_t *spa = dmu_tx_pool(tx)->dp_spa; | ||||
| 	VERIFY0(dmu_objset_from_ds(ds, &os)); | ||||
| 
 | ||||
| 	/* Check that the clone is in a correct state to be deleted */ | ||||
| 	dsl_clone_destroy_assert(dd); | ||||
| 
 | ||||
| 	/* Destroy the zil */ | ||||
| 	zil_destroy_sync(dmu_objset_zil(os), tx); | ||||
| 
 | ||||
| 	VERIFY0(zap_lookup(mos, dd->dd_object, | ||||
| 	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete)); | ||||
| 	/* Initialize deleted_clones entry to track livelists to cleanup */ | ||||
| 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, | ||||
| 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); | ||||
| 	if (error == ENOENT) { | ||||
| 		zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA, | ||||
| 		    DMU_OT_NONE, 0, tx); | ||||
| 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, | ||||
| 		    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, | ||||
| 		    &(zap_obj), tx)); | ||||
| 		spa->spa_livelists_to_delete = zap_obj; | ||||
| 	} else if (error != 0) { | ||||
| 		zfs_panic_recover("zfs: error %d was returned while looking " | ||||
| 		    "up DMU_POOL_DELETED_CLONES in the zap"); | ||||
| 		return; | ||||
| 	} | ||||
| 	VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx)); | ||||
| 
 | ||||
| 	/* Clone is no longer using space, now tracked by dp_free_dir */ | ||||
| 	dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); | ||||
| 	dsl_dir_diduse_space(dd, DD_USED_HEAD, | ||||
| 	    -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes, | ||||
| 	    tx); | ||||
| 	dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, | ||||
| 	    used, comp, uncomp, tx); | ||||
| 	dsl_dir_remove_livelist(dd, tx, B_FALSE); | ||||
| 	zthr_wakeup(spa->spa_livelist_delete_zthr); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Move the bptree into the pool's list of trees to clean up, update space | ||||
|  * accounting information and destroy the zil. | ||||
|  */ | ||||
| void | ||||
| dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) | ||||
| { | ||||
| 	uint64_t used, comp, uncomp; | ||||
| 	objset_t *os; | ||||
| 
 | ||||
| 	VERIFY0(dmu_objset_from_ds(ds, &os)); | ||||
| 	dsl_pool_t *dp = dmu_tx_pool(tx); | ||||
| 	objset_t *mos = dp->dp_meta_objset; | ||||
| 
 | ||||
| 	zil_destroy_sync(dmu_objset_zil(os), tx); | ||||
| 
 | ||||
| 	if (!spa_feature_is_active(dp->dp_spa, | ||||
| 	    SPA_FEATURE_ASYNC_DESTROY)) { | ||||
| 		dsl_scan_t *scn = dp->dp_scan; | ||||
| 		spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, | ||||
| 		    tx); | ||||
| 		dp->dp_bptree_obj = bptree_alloc(mos, tx); | ||||
| 		VERIFY0(zap_add(mos, | ||||
| 		    DMU_POOL_DIRECTORY_OBJECT, | ||||
| 		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, | ||||
| 		    &dp->dp_bptree_obj, tx)); | ||||
| 		ASSERT(!scn->scn_async_destroying); | ||||
| 		scn->scn_async_destroying = B_TRUE; | ||||
| 	} | ||||
| 
 | ||||
| 	used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; | ||||
| 	comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; | ||||
| 	uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; | ||||
| 
 | ||||
| 	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || | ||||
| 	    dsl_dataset_phys(ds)->ds_unique_bytes == used); | ||||
| 
 | ||||
| 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); | ||||
| 	bptree_add(mos, dp->dp_bptree_obj, | ||||
| 	    &dsl_dataset_phys(ds)->ds_bp, | ||||
| 	    dsl_dataset_phys(ds)->ds_prev_snap_txg, | ||||
| 	    used, comp, uncomp, tx); | ||||
| 	rrw_exit(&ds->ds_bp_rwlock, FTAG); | ||||
| 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, | ||||
| 	    -used, -comp, -uncomp, tx); | ||||
| 	dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, | ||||
| 	    used, comp, uncomp, tx); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) | ||||
| { | ||||
| @ -924,51 +1048,18 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) | ||||
| 	if (dsl_dataset_remap_deadlist_exists(ds)) | ||||
| 		dsl_dataset_destroy_remap_deadlist(ds, tx); | ||||
| 
 | ||||
| 	objset_t *os; | ||||
| 	VERIFY0(dmu_objset_from_ds(ds, &os)); | ||||
| 
 | ||||
| 	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { | ||||
| 		old_synchronous_dataset_destroy(ds, tx); | ||||
| 	} else { | ||||
| 	/*
 | ||||
| 		 * Move the bptree into the pool's list of trees to | ||||
| 		 * clean up and update space accounting information. | ||||
| 	 * Each destroy is responsible for both destroying (enqueuing | ||||
| 	 * to be destroyed) the blkptrs comprising the dataset as well as | ||||
| 	 * those belonging to the zil. | ||||
| 	 */ | ||||
| 		uint64_t used, comp, uncomp; | ||||
| 
 | ||||
| 		zil_destroy_sync(dmu_objset_zil(os), tx); | ||||
| 
 | ||||
| 		if (!spa_feature_is_active(dp->dp_spa, | ||||
| 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { | ||||
| 		dsl_async_clone_destroy(ds, tx); | ||||
| 	} else if (spa_feature_is_enabled(dp->dp_spa, | ||||
| 	    SPA_FEATURE_ASYNC_DESTROY)) { | ||||
| 			dsl_scan_t *scn = dp->dp_scan; | ||||
| 			spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, | ||||
| 			    tx); | ||||
| 			dp->dp_bptree_obj = bptree_alloc(mos, tx); | ||||
| 			VERIFY0(zap_add(mos, | ||||
| 			    DMU_POOL_DIRECTORY_OBJECT, | ||||
| 			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, | ||||
| 			    &dp->dp_bptree_obj, tx)); | ||||
| 			ASSERT(!scn->scn_async_destroying); | ||||
| 			scn->scn_async_destroying = B_TRUE; | ||||
| 		} | ||||
| 
 | ||||
| 		used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; | ||||
| 		comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; | ||||
| 		uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; | ||||
| 
 | ||||
| 		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || | ||||
| 		    dsl_dataset_phys(ds)->ds_unique_bytes == used); | ||||
| 
 | ||||
| 		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); | ||||
| 		bptree_add(mos, dp->dp_bptree_obj, | ||||
| 		    &dsl_dataset_phys(ds)->ds_bp, | ||||
| 		    dsl_dataset_phys(ds)->ds_prev_snap_txg, | ||||
| 		    used, comp, uncomp, tx); | ||||
| 		rrw_exit(&ds->ds_bp_rwlock, FTAG); | ||||
| 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, | ||||
| 		    -used, -comp, -uncomp, tx); | ||||
| 		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, | ||||
| 		    used, comp, uncomp, tx); | ||||
| 		dsl_async_dataset_destroy(ds, tx); | ||||
| 	} else { | ||||
| 		old_synchronous_dataset_destroy(ds, tx); | ||||
| 	} | ||||
| 
 | ||||
| 	if (ds->ds_prev != NULL) { | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2018 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2013 Martin Matuska. All rights reserved. | ||||
|  * Copyright (c) 2014 Joyent, Inc. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
| @ -48,6 +48,7 @@ | ||||
| #include <sys/policy.h> | ||||
| #include <sys/zfs_znode.h> | ||||
| #include <sys/zvol.h> | ||||
| #include <sys/zthr.h> | ||||
| #include "zfs_namecheck.h" | ||||
| #include "zfs_prop.h" | ||||
| 
 | ||||
| @ -155,6 +156,9 @@ dsl_dir_evict_async(void *dbu) | ||||
| 
 | ||||
| 	spa_async_close(dd->dd_pool->dp_spa, dd); | ||||
| 
 | ||||
| 	if (dsl_deadlist_is_open(&dd->dd_livelist)) | ||||
| 		dsl_dir_livelist_close(dd); | ||||
| 
 | ||||
| 	dsl_prop_fini(dd); | ||||
| 	mutex_destroy(&dd->dd_lock); | ||||
| 	kmem_free(dd, sizeof (dsl_dir_t)); | ||||
| @ -255,6 +259,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, | ||||
| 			dd->dd_origin_txg = | ||||
| 			    origin_phys->ds_creation_txg; | ||||
| 			dmu_buf_rele(origin_bonus, FTAG); | ||||
| 			if (dsl_dir_is_zapified(dd)) { | ||||
| 				uint64_t obj; | ||||
| 				err = zap_lookup(dp->dp_meta_objset, | ||||
| 				    dd->dd_object, DD_FIELD_LIVELIST, | ||||
| 				    sizeof (uint64_t), 1, &obj); | ||||
| 				if (err == 0) | ||||
| 					dsl_dir_livelist_open(dd, obj); | ||||
| 				else if (err != ENOENT) | ||||
| 					goto errout; | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, | ||||
| @ -263,6 +277,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, | ||||
| 		if (winner != NULL) { | ||||
| 			if (dd->dd_parent) | ||||
| 				dsl_dir_rele(dd->dd_parent, dd); | ||||
| 			if (dsl_deadlist_is_open(&dd->dd_livelist)) | ||||
| 				dsl_dir_livelist_close(dd); | ||||
| 			dsl_prop_fini(dd); | ||||
| 			mutex_destroy(&dd->dd_lock); | ||||
| 			kmem_free(dd, sizeof (dsl_dir_t)); | ||||
| @ -291,6 +307,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, | ||||
| errout: | ||||
| 	if (dd->dd_parent) | ||||
| 		dsl_dir_rele(dd->dd_parent, dd); | ||||
| 	if (dsl_deadlist_is_open(&dd->dd_livelist)) | ||||
| 		dsl_dir_livelist_close(dd); | ||||
| 	dsl_prop_fini(dd); | ||||
| 	mutex_destroy(&dd->dd_lock); | ||||
| 	kmem_free(dd, sizeof (dsl_dir_t)); | ||||
| @ -2178,6 +2196,90 @@ dsl_dir_is_zapified(dsl_dir_t *dd) | ||||
| 	return (doi.doi_type == DMU_OTN_ZAP_METADATA); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) | ||||
| { | ||||
| 	objset_t *mos = dd->dd_pool->dp_meta_objset; | ||||
| 	ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, | ||||
| 	    SPA_FEATURE_LIVELIST)); | ||||
| 	dsl_deadlist_open(&dd->dd_livelist, mos, obj); | ||||
| 	bplist_create(&dd->dd_pending_allocs); | ||||
| 	bplist_create(&dd->dd_pending_frees); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| dsl_dir_livelist_close(dsl_dir_t *dd) | ||||
| { | ||||
| 	dsl_deadlist_close(&dd->dd_livelist); | ||||
| 	bplist_destroy(&dd->dd_pending_allocs); | ||||
| 	bplist_destroy(&dd->dd_pending_frees); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) | ||||
| { | ||||
| 	uint64_t obj; | ||||
| 	dsl_pool_t *dp = dmu_tx_pool(tx); | ||||
| 	spa_t *spa = dp->dp_spa; | ||||
| 	livelist_condense_entry_t to_condense = spa->spa_to_condense; | ||||
| 
 | ||||
| 	if (!dsl_deadlist_is_open(&dd->dd_livelist)) | ||||
| 		return; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If the livelist being removed is set to be condensed, stop the | ||||
| 	 * condense zthr and indicate the cancellation in the spa_to_condense | ||||
| 	 * struct in case the condense no-wait synctask has already started | ||||
| 	 */ | ||||
| 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; | ||||
| 	if (ll_condense_thread != NULL && | ||||
| 	    (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { | ||||
| 			/*
 | ||||
| 			 * We use zthr_wait_cycle_done instead of zthr_cancel | ||||
| 			 * because we don't want to destroy the zthr, just have | ||||
| 			 * it skip its current task. | ||||
| 			 */ | ||||
| 			spa->spa_to_condense.cancelled = B_TRUE; | ||||
| 			zthr_wait_cycle_done(ll_condense_thread); | ||||
| 			/*
 | ||||
| 			 * If we've returned from zthr_wait_cycle_done without | ||||
| 			 * clearing the to_condense data structure it's either | ||||
| 			 * because the no-wait synctask has started (which is | ||||
| 			 * indicated by 'syncing' field of to_condense) and we | ||||
| 			 * can expect it to clear to_condense on its own. | ||||
| 			 * Otherwise, we returned before the zthr ran. The | ||||
| 			 * checkfunc will now fail as cancelled == B_TRUE so we | ||||
| 			 * can safely NULL out ds, allowing a different dir's | ||||
| 			 * livelist to be condensed. | ||||
| 			 * | ||||
| 			 * We can be sure that the to_condense struct will not | ||||
| 			 * be repopulated at this stage because both this | ||||
| 			 * function and dsl_livelist_try_condense execute in | ||||
| 			 * syncing context. | ||||
| 			 */ | ||||
| 			if ((spa->spa_to_condense.ds != NULL) && | ||||
| 			    !spa->spa_to_condense.syncing) { | ||||
| 				dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, | ||||
| 				    spa); | ||||
| 				spa->spa_to_condense.ds = NULL; | ||||
| 			} | ||||
| 	} | ||||
| 
 | ||||
| 	dsl_dir_livelist_close(dd); | ||||
| 	int err = zap_lookup(dp->dp_meta_objset, dd->dd_object, | ||||
| 	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj); | ||||
| 	if (err == 0) { | ||||
| 		VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, | ||||
| 		    DD_FIELD_LIVELIST, tx)); | ||||
| 		if (total) { | ||||
| 			dsl_deadlist_free(dp->dp_meta_objset, obj, tx); | ||||
| 			spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); | ||||
| 		} | ||||
| 	} else { | ||||
| 		ASSERT3U(err, !=, ENOENT); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| #if defined(_KERNEL) | ||||
| EXPORT_SYMBOL(dsl_dir_set_quota); | ||||
| EXPORT_SYMBOL(dsl_dir_set_reservation); | ||||
|  | ||||
| @ -721,7 +721,8 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) | ||||
| 	 * Now that the datasets have been completely synced, we can | ||||
| 	 * clean up our in-memory structures accumulated while syncing: | ||||
| 	 * | ||||
| 	 *  - move dead blocks from the pending deadlist to the on-disk deadlist | ||||
| 	 *  - move dead blocks from the pending deadlist and livelists | ||||
| 	 *    to the on-disk versions | ||||
| 	 *  - release hold from dsl_dataset_dirty() | ||||
| 	 *  - release key mapping hold from dsl_dataset_dirty() | ||||
| 	 */ | ||||
|  | ||||
| @ -3103,8 +3103,18 @@ dsl_scan_update_stats(dsl_scan_t *scn) | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	ASSERT(!bp_freed); | ||||
| 	return (dsl_scan_free_block_cb(arg, bp, tx)); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	ASSERT(!bp_freed); | ||||
| 	dsl_scan_t *scn = arg; | ||||
| 	const dva_t *dva = &bp->blk_dva[0]; | ||||
| 
 | ||||
| @ -3123,6 +3133,7 @@ dsl_scan_active(dsl_scan_t *scn) | ||||
| { | ||||
| 	spa_t *spa = scn->scn_dp->dp_spa; | ||||
| 	uint64_t used = 0, comp, uncomp; | ||||
| 	boolean_t clones_left; | ||||
| 
 | ||||
| 	if (spa->spa_load_state != SPA_LOAD_NONE) | ||||
| 		return (B_FALSE); | ||||
| @ -3136,7 +3147,8 @@ dsl_scan_active(dsl_scan_t *scn) | ||||
| 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj, | ||||
| 		    &used, &comp, &uncomp); | ||||
| 	} | ||||
| 	return (used != 0); | ||||
| 	clones_left = spa_livelist_delete_check(spa); | ||||
| 	return ((used != 0) || (clones_left)); | ||||
| } | ||||
| 
 | ||||
| static boolean_t | ||||
| @ -3233,7 +3245,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) | ||||
| 		scn->scn_zio_root = zio_root(spa, NULL, | ||||
| 		    NULL, ZIO_FLAG_MUSTSUCCEED); | ||||
| 		err = bpobj_iterate(&dp->dp_free_bpobj, | ||||
| 		    dsl_scan_free_block_cb, scn, tx); | ||||
| 		    bpobj_dsl_scan_free_block_cb, scn, tx); | ||||
| 		VERIFY0(zio_wait(scn->scn_zio_root)); | ||||
| 		scn->scn_zio_root = NULL; | ||||
| 
 | ||||
| @ -3330,7 +3342,8 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) | ||||
| 		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); | ||||
| 	} | ||||
| 
 | ||||
| 	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { | ||||
| 	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && | ||||
| 	    !spa_livelist_delete_check(spa)) { | ||||
| 		/* finished; verify that space accounting went to zero */ | ||||
| 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); | ||||
| 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); | ||||
|  | ||||
							
								
								
									
										499
									
								
								module/zfs/spa.c
									
									
									
									
									
								
							
							
						
						
									
										499
									
								
								module/zfs/spa.c
									
									
									
									
									
								
							| @ -232,6 +232,27 @@ uint64_t	zfs_max_missing_tvds_scan = 0; | ||||
|  */ | ||||
| boolean_t	zfs_pause_spa_sync = B_FALSE; | ||||
| 
 | ||||
| /*
 | ||||
|  * Variables to indicate the livelist condense zthr func should wait at certain | ||||
|  * points for the livelist to be removed - used to test condense/destroy races | ||||
|  */ | ||||
| int zfs_livelist_condense_zthr_pause = 0; | ||||
| int zfs_livelist_condense_sync_pause = 0; | ||||
| 
 | ||||
| /*
 | ||||
|  * Variables to track whether or not condense cancellation has been | ||||
|  * triggered in testing. | ||||
|  */ | ||||
| int zfs_livelist_condense_sync_cancel = 0; | ||||
| int zfs_livelist_condense_zthr_cancel = 0; | ||||
| 
 | ||||
| /*
 | ||||
|  * Variable to track whether or not extra ALLOC blkptrs were added to a | ||||
|  * livelist entry while it was being condensed (caused by the way we track | ||||
|  * remapped blkptrs in dbuf_remap_impl) | ||||
|  */ | ||||
| int zfs_livelist_condense_new_alloc = 0; | ||||
| 
 | ||||
| /*
 | ||||
|  * ========================================================================== | ||||
|  * SPA properties routines | ||||
| @ -1481,6 +1502,27 @@ spa_unload_log_sm_metadata(spa_t *spa) | ||||
| 	spa->spa_unflushed_stats.sus_blocklimit = 0; | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| spa_destroy_aux_threads(spa_t *spa) | ||||
| { | ||||
| 	if (spa->spa_condense_zthr != NULL) { | ||||
| 		zthr_destroy(spa->spa_condense_zthr); | ||||
| 		spa->spa_condense_zthr = NULL; | ||||
| 	} | ||||
| 	if (spa->spa_checkpoint_discard_zthr != NULL) { | ||||
| 		zthr_destroy(spa->spa_checkpoint_discard_zthr); | ||||
| 		spa->spa_checkpoint_discard_zthr = NULL; | ||||
| 	} | ||||
| 	if (spa->spa_livelist_delete_zthr != NULL) { | ||||
| 		zthr_destroy(spa->spa_livelist_delete_zthr); | ||||
| 		spa->spa_livelist_delete_zthr = NULL; | ||||
| 	} | ||||
| 	if (spa->spa_livelist_condense_zthr != NULL) { | ||||
| 		zthr_destroy(spa->spa_livelist_condense_zthr); | ||||
| 		spa->spa_livelist_condense_zthr = NULL; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Opposite of spa_load(). | ||||
|  */ | ||||
| @ -1552,15 +1594,7 @@ spa_unload(spa_t *spa) | ||||
| 		spa->spa_vdev_removal = NULL; | ||||
| 	} | ||||
| 
 | ||||
| 	if (spa->spa_condense_zthr != NULL) { | ||||
| 		zthr_destroy(spa->spa_condense_zthr); | ||||
| 		spa->spa_condense_zthr = NULL; | ||||
| 	} | ||||
| 
 | ||||
| 	if (spa->spa_checkpoint_discard_zthr != NULL) { | ||||
| 		zthr_destroy(spa->spa_checkpoint_discard_zthr); | ||||
| 		spa->spa_checkpoint_discard_zthr = NULL; | ||||
| 	} | ||||
| 	spa_destroy_aux_threads(spa); | ||||
| 
 | ||||
| 	spa_condense_fini(spa); | ||||
| 
 | ||||
| @ -2335,6 +2369,376 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) | ||||
| 	return (SET_ERROR(err)); | ||||
| } | ||||
| 
 | ||||
| boolean_t | ||||
| spa_livelist_delete_check(spa_t *spa) | ||||
| { | ||||
| 	return (spa->spa_livelists_to_delete != 0); | ||||
| } | ||||
| 
 | ||||
| /* ARGSUSED */ | ||||
| static boolean_t | ||||
| spa_livelist_delete_cb_check(void *arg, zthr_t *z) | ||||
| { | ||||
| 	spa_t *spa = arg; | ||||
| 	return (spa_livelist_delete_check(spa)); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| { | ||||
| 	spa_t *spa = arg; | ||||
| 	zio_free(spa, tx->tx_txg, bp); | ||||
| 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, | ||||
| 	    -bp_get_dsize_sync(spa, bp), | ||||
| 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) | ||||
| { | ||||
| 	int err; | ||||
| 	zap_cursor_t zc; | ||||
| 	zap_attribute_t za; | ||||
| 	zap_cursor_init(&zc, os, zap_obj); | ||||
| 	err = zap_cursor_retrieve(&zc, &za); | ||||
| 	zap_cursor_fini(&zc); | ||||
| 	if (err == 0) | ||||
| 		*llp = za.za_first_integer; | ||||
| 	return (err); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Components of livelist deletion that must be performed in syncing | ||||
|  * context: freeing block pointers and updating the pool-wide data | ||||
|  * structures to indicate how much work is left to do | ||||
|  */ | ||||
| typedef struct sublist_delete_arg { | ||||
| 	spa_t *spa; | ||||
| 	dsl_deadlist_t *ll; | ||||
| 	uint64_t key; | ||||
| 	bplist_t *to_free; | ||||
| } sublist_delete_arg_t; | ||||
| 
 | ||||
| static void | ||||
| sublist_delete_sync(void *arg, dmu_tx_t *tx) | ||||
| { | ||||
| 	sublist_delete_arg_t *sda = arg; | ||||
| 	spa_t *spa = sda->spa; | ||||
| 	dsl_deadlist_t *ll = sda->ll; | ||||
| 	uint64_t key = sda->key; | ||||
| 	bplist_t *to_free = sda->to_free; | ||||
| 
 | ||||
| 	bplist_iterate(to_free, delete_blkptr_cb, spa, tx); | ||||
| 	dsl_deadlist_remove_entry(ll, key, tx); | ||||
| } | ||||
| 
 | ||||
| typedef struct livelist_delete_arg { | ||||
| 	spa_t *spa; | ||||
| 	uint64_t ll_obj; | ||||
| 	uint64_t zap_obj; | ||||
| } livelist_delete_arg_t; | ||||
| 
 | ||||
| static void | ||||
| livelist_delete_sync(void *arg, dmu_tx_t *tx) | ||||
| { | ||||
| 	livelist_delete_arg_t *lda = arg; | ||||
| 	spa_t *spa = lda->spa; | ||||
| 	uint64_t ll_obj = lda->ll_obj; | ||||
| 	uint64_t zap_obj = lda->zap_obj; | ||||
| 	objset_t *mos = spa->spa_meta_objset; | ||||
| 	uint64_t count; | ||||
| 
 | ||||
| 	/* free the livelist and decrement the feature count */ | ||||
| 	VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); | ||||
| 	dsl_deadlist_free(mos, ll_obj, tx); | ||||
| 	spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); | ||||
| 	VERIFY0(zap_count(mos, zap_obj, &count)); | ||||
| 	if (count == 0) { | ||||
| 		/* no more livelists to delete */ | ||||
| 		VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, | ||||
| 		    DMU_POOL_DELETED_CLONES, tx)); | ||||
| 		VERIFY0(zap_destroy(mos, zap_obj, tx)); | ||||
| 		spa->spa_livelists_to_delete = 0; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Load in the value for the livelist to be removed and open it. Then, | ||||
|  * load its first sublist and determine which block pointers should actually | ||||
|  * be freed. Then, call a synctask which performs the actual frees and updates | ||||
|  * the pool-wide livelist data. | ||||
|  */ | ||||
| /* ARGSUSED */ | ||||
| void | ||||
| spa_livelist_delete_cb(void *arg, zthr_t *z) | ||||
| { | ||||
| 	spa_t *spa = arg; | ||||
| 	uint64_t ll_obj = 0, count; | ||||
| 	objset_t *mos = spa->spa_meta_objset; | ||||
| 	uint64_t zap_obj = spa->spa_livelists_to_delete; | ||||
| 	/*
 | ||||
| 	 * Determine the next livelist to delete. This function should only | ||||
| 	 * be called if there is at least one deleted clone. | ||||
| 	 */ | ||||
| 	VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); | ||||
| 	VERIFY0(zap_count(mos, ll_obj, &count)); | ||||
| 	if (count > 0) { | ||||
| 		dsl_deadlist_t ll = { 0 }; | ||||
| 		dsl_deadlist_entry_t *dle; | ||||
| 		bplist_t to_free; | ||||
| 		dsl_deadlist_open(&ll, mos, ll_obj); | ||||
| 		dle = dsl_deadlist_first(&ll); | ||||
| 		ASSERT3P(dle, !=, NULL); | ||||
| 		bplist_create(&to_free); | ||||
| 		int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, | ||||
| 		    z, NULL); | ||||
| 		if (err == 0) { | ||||
| 			sublist_delete_arg_t sync_arg = { | ||||
| 			    .spa = spa, | ||||
| 			    .ll = &ll, | ||||
| 			    .key = dle->dle_mintxg, | ||||
| 			    .to_free = &to_free | ||||
| 			}; | ||||
| 			zfs_dbgmsg("deleting sublist (id %llu) from" | ||||
| 			    " livelist %llu, %d remaining", | ||||
| 			    dle->dle_bpobj.bpo_object, ll_obj, count - 1); | ||||
| 			VERIFY0(dsl_sync_task(spa_name(spa), NULL, | ||||
| 			    sublist_delete_sync, &sync_arg, 0, | ||||
| 			    ZFS_SPACE_CHECK_DESTROY)); | ||||
| 		} else { | ||||
| 			ASSERT(err == EINTR); | ||||
| 		} | ||||
| 		bplist_clear(&to_free); | ||||
| 		bplist_destroy(&to_free); | ||||
| 		dsl_deadlist_close(&ll); | ||||
| 	} else { | ||||
| 		livelist_delete_arg_t sync_arg = { | ||||
| 		    .spa = spa, | ||||
| 		    .ll_obj = ll_obj, | ||||
| 		    .zap_obj = zap_obj | ||||
| 		}; | ||||
| 		zfs_dbgmsg("deletion of livelist %llu completed", ll_obj); | ||||
| 		VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, | ||||
| 		    &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void | ||||
| spa_start_livelist_destroy_thread(spa_t *spa) | ||||
| { | ||||
| 	ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); | ||||
| 	spa->spa_livelist_delete_zthr = zthr_create( | ||||
| 	    spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa); | ||||
| } | ||||
| 
 | ||||
| typedef struct livelist_new_arg { | ||||
| 	bplist_t *allocs; | ||||
| 	bplist_t *frees; | ||||
| } livelist_new_arg_t; | ||||
| 
 | ||||
| static int | ||||
| livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	ASSERT(tx == NULL); | ||||
| 	livelist_new_arg_t *lna = arg; | ||||
| 	if (bp_freed) { | ||||
| 		bplist_append(lna->frees, bp); | ||||
| 	} else { | ||||
| 		bplist_append(lna->allocs, bp); | ||||
| 		zfs_livelist_condense_new_alloc++; | ||||
| 	} | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| typedef struct livelist_condense_arg { | ||||
| 	spa_t *spa; | ||||
| 	bplist_t to_keep; | ||||
| 	uint64_t first_size; | ||||
| 	uint64_t next_size; | ||||
| } livelist_condense_arg_t; | ||||
| 
 | ||||
| static void | ||||
| spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) | ||||
| { | ||||
| 	livelist_condense_arg_t *lca = arg; | ||||
| 	spa_t *spa = lca->spa; | ||||
| 	bplist_t new_frees; | ||||
| 	dsl_dataset_t *ds = spa->spa_to_condense.ds; | ||||
| 
 | ||||
| 	/* Have we been cancelled? */ | ||||
| 	if (spa->spa_to_condense.cancelled) { | ||||
| 		zfs_livelist_condense_sync_cancel++; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first; | ||||
| 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next; | ||||
| 	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * It's possible that the livelist was changed while the zthr was | ||||
| 	 * running. Therefore, we need to check for new blkptrs in the two | ||||
| 	 * entries being condensed and continue to track them in the livelist. | ||||
| 	 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), | ||||
| 	 * it's possible that the newly added blkptrs are FREEs or ALLOCs so | ||||
| 	 * we need to sort them into two different bplists. | ||||
| 	 */ | ||||
| 	uint64_t first_obj = first->dle_bpobj.bpo_object; | ||||
| 	uint64_t next_obj = next->dle_bpobj.bpo_object; | ||||
| 	uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; | ||||
| 	uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; | ||||
| 
 | ||||
| 	bplist_create(&new_frees); | ||||
| 	livelist_new_arg_t new_bps = { | ||||
| 	    .allocs = &lca->to_keep, | ||||
| 	    .frees = &new_frees, | ||||
| 	}; | ||||
| 
 | ||||
| 	if (cur_first_size > lca->first_size) { | ||||
| 		VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, | ||||
| 		    livelist_track_new_cb, &new_bps, lca->first_size)); | ||||
| 	} | ||||
| 	if (cur_next_size > lca->next_size) { | ||||
| 		VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, | ||||
| 		    livelist_track_new_cb, &new_bps, lca->next_size)); | ||||
| 	} | ||||
| 
 | ||||
| 	dsl_deadlist_clear_entry(first, ll, tx); | ||||
| 	ASSERT(bpobj_is_empty(&first->dle_bpobj)); | ||||
| 	dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); | ||||
| 
 | ||||
| 	bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); | ||||
| 	bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); | ||||
| 	bplist_destroy(&new_frees); | ||||
| 
 | ||||
| 	char dsname[ZFS_MAX_DATASET_NAME_LEN]; | ||||
| 	dsl_dataset_name(ds, dsname); | ||||
| 	zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " | ||||
| 	    "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " | ||||
| 	    "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj, | ||||
| 	    cur_first_size, next_obj, cur_next_size, | ||||
| 	    first->dle_bpobj.bpo_object, | ||||
| 	    first->dle_bpobj.bpo_phys->bpo_num_blkptrs); | ||||
| out: | ||||
| 	dmu_buf_rele(ds->ds_dbuf, spa); | ||||
| 	spa->spa_to_condense.ds = NULL; | ||||
| 	bplist_clear(&lca->to_keep); | ||||
| 	bplist_destroy(&lca->to_keep); | ||||
| 	kmem_free(lca, sizeof (livelist_condense_arg_t)); | ||||
| 	spa->spa_to_condense.syncing = B_FALSE; | ||||
| } | ||||
| 
 | ||||
| void | ||||
| spa_livelist_condense_cb(void *arg, zthr_t *t) | ||||
| { | ||||
| 	while (zfs_livelist_condense_zthr_pause && | ||||
| 	    !(zthr_has_waiters(t) || zthr_iscancelled(t))) | ||||
| 		delay(1); | ||||
| 
 | ||||
| 	spa_t *spa = arg; | ||||
| 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first; | ||||
| 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next; | ||||
| 	uint64_t first_size, next_size; | ||||
| 
 | ||||
| 	livelist_condense_arg_t *lca = | ||||
| 	    kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); | ||||
| 	bplist_create(&lca->to_keep); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Process the livelists (matching FREEs and ALLOCs) in open context | ||||
| 	 * so we have minimal work in syncing context to condense. | ||||
| 	 * | ||||
| 	 * We save bpobj sizes (first_size and next_size) to use later in | ||||
| 	 * syncing context to determine if entries were added to these sublists | ||||
| 	 * while in open context. This is possible because the clone is still | ||||
| 	 * active and open for normal writes and we want to make sure the new, | ||||
| 	 * unprocessed blockpointers are inserted into the livelist normally. | ||||
| 	 * | ||||
| 	 * Note that dsl_process_sub_livelist() both stores the size number of | ||||
| 	 * blockpointers and iterates over them while the bpobj's lock held, so | ||||
| 	 * the sizes returned to us are consistent which what was actually | ||||
| 	 * processed. | ||||
| 	 */ | ||||
| 	int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, | ||||
| 	    &first_size); | ||||
| 	if (err == 0) | ||||
| 		err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, | ||||
| 		    t, &next_size); | ||||
| 
 | ||||
| 	if (err == 0) { | ||||
| 		while (zfs_livelist_condense_sync_pause && | ||||
| 		    !(zthr_has_waiters(t) || zthr_iscancelled(t))) | ||||
| 			delay(1); | ||||
| 
 | ||||
| 		dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | ||||
| 		dmu_tx_mark_netfree(tx); | ||||
| 		dmu_tx_hold_space(tx, 1); | ||||
| 		err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); | ||||
| 		if (err == 0) { | ||||
| 			/*
 | ||||
| 			 * Prevent the condense zthr restarting before | ||||
| 			 * the synctask completes. | ||||
| 			 */ | ||||
| 			spa->spa_to_condense.syncing = B_TRUE; | ||||
| 			lca->spa = spa; | ||||
| 			lca->first_size = first_size; | ||||
| 			lca->next_size = next_size; | ||||
| 			dsl_sync_task_nowait(spa_get_dsl(spa), | ||||
| 			    spa_livelist_condense_sync, lca, 0, | ||||
| 			    ZFS_SPACE_CHECK_NONE, tx); | ||||
| 			dmu_tx_commit(tx); | ||||
| 			return; | ||||
| 		} | ||||
| 	} | ||||
| 	/*
 | ||||
| 	 * Condensing can not continue: either it was externally stopped or | ||||
| 	 * we were unable to assign to a tx because the pool has run out of | ||||
| 	 * space. In the second case, we'll just end up trying to condense | ||||
| 	 * again in a later txg. | ||||
| 	 */ | ||||
| 	ASSERT(err != 0); | ||||
| 	bplist_clear(&lca->to_keep); | ||||
| 	bplist_destroy(&lca->to_keep); | ||||
| 	kmem_free(lca, sizeof (livelist_condense_arg_t)); | ||||
| 	dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); | ||||
| 	spa->spa_to_condense.ds = NULL; | ||||
| 	if (err == EINTR) | ||||
| 		zfs_livelist_condense_zthr_cancel++; | ||||
| } | ||||
| 
 | ||||
| /* ARGSUSED */ | ||||
| /*
 | ||||
|  * Check that there is something to condense but that a condense is not | ||||
|  * already in progress and that condensing has not been cancelled. | ||||
|  */ | ||||
| static boolean_t | ||||
| spa_livelist_condense_cb_check(void *arg, zthr_t *z) | ||||
| { | ||||
| 	spa_t *spa = arg; | ||||
| 	if ((spa->spa_to_condense.ds != NULL) && | ||||
| 	    (spa->spa_to_condense.syncing == B_FALSE) && | ||||
| 	    (spa->spa_to_condense.cancelled == B_FALSE)) { | ||||
| 		return (B_TRUE); | ||||
| 	} | ||||
| 	return (B_FALSE); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| spa_start_livelist_condensing_thread(spa_t *spa) | ||||
| { | ||||
| 	spa->spa_to_condense.ds = NULL; | ||||
| 	spa->spa_to_condense.first = NULL; | ||||
| 	spa->spa_to_condense.next = NULL; | ||||
| 	spa->spa_to_condense.syncing = B_FALSE; | ||||
| 	spa->spa_to_condense.cancelled = B_FALSE; | ||||
| 
 | ||||
| 	ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); | ||||
| 	spa->spa_livelist_condense_zthr = zthr_create( | ||||
| 	    spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| spa_spawn_aux_threads(spa_t *spa) | ||||
| { | ||||
| @ -2343,6 +2747,8 @@ spa_spawn_aux_threads(spa_t *spa) | ||||
| 	ASSERT(MUTEX_HELD(&spa_namespace_lock)); | ||||
| 
 | ||||
| 	spa_start_indirect_condensing_thread(spa); | ||||
| 	spa_start_livelist_destroy_thread(spa); | ||||
| 	spa_start_livelist_condensing_thread(spa); | ||||
| 
 | ||||
| 	ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); | ||||
| 	spa->spa_checkpoint_discard_zthr = | ||||
| @ -3603,6 +4009,15 @@ spa_ld_get_props(spa_t *spa) | ||||
| 	if (error != 0 && error != ENOENT) | ||||
| 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Load the livelist deletion field. If a livelist is queued for | ||||
| 	 * deletion, indicate that in the spa | ||||
| 	 */ | ||||
| 	error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, | ||||
| 	    &spa->spa_livelists_to_delete, B_FALSE); | ||||
| 	if (error != 0 && error != ENOENT) | ||||
| 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Load the history object.  If we have an older pool, this | ||||
| 	 * will not be present. | ||||
| @ -7571,6 +7986,14 @@ spa_async_suspend(spa_t *spa) | ||||
| 	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; | ||||
| 	if (discard_thread != NULL) | ||||
| 		zthr_cancel(discard_thread); | ||||
| 
 | ||||
| 	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; | ||||
| 	if (ll_delete_thread != NULL) | ||||
| 		zthr_cancel(ll_delete_thread); | ||||
| 
 | ||||
| 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; | ||||
| 	if (ll_condense_thread != NULL) | ||||
| 		zthr_cancel(ll_condense_thread); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| @ -7589,6 +8012,14 @@ spa_async_resume(spa_t *spa) | ||||
| 	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; | ||||
| 	if (discard_thread != NULL) | ||||
| 		zthr_resume(discard_thread); | ||||
| 
 | ||||
| 	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; | ||||
| 	if (ll_delete_thread != NULL) | ||||
| 		zthr_resume(ll_delete_thread); | ||||
| 
 | ||||
| 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; | ||||
| 	if (ll_condense_thread != NULL) | ||||
| 		zthr_resume(ll_condense_thread); | ||||
| } | ||||
| 
 | ||||
| static boolean_t | ||||
| @ -7639,14 +8070,28 @@ spa_async_request(spa_t *spa, int task) | ||||
|  * ========================================================================== | ||||
|  */ | ||||
| 
 | ||||
| 
 | ||||
| static int | ||||
| bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	bpobj_t *bpo = arg; | ||||
| 	bpobj_enqueue(bpo, bp, tx); | ||||
| 	bpobj_enqueue(bpo, bp, bp_freed, tx); | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| int | ||||
| bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| { | ||||
| 	return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); | ||||
| } | ||||
| 
 | ||||
| int | ||||
| bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| { | ||||
| 	return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| { | ||||
| @ -7657,6 +8102,14 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, | ||||
|     dmu_tx_t *tx) | ||||
| { | ||||
| 	ASSERT(!bp_freed); | ||||
| 	return (spa_free_sync_cb(arg, bp, tx)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Note: this simple function is not inlined to make it easier to dtrace the | ||||
|  * amount of time spent syncing frees. | ||||
| @ -7693,7 +8146,7 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) | ||||
| 	 */ | ||||
| 	zio_t *zio = zio_root(spa, NULL, NULL, 0); | ||||
| 	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, | ||||
| 	    spa_free_sync_cb, zio, tx), ==, 0); | ||||
| 	    bpobj_spa_free_sync_cb, zio, tx), ==, 0); | ||||
| 	VERIFY0(zio_wait(zio)); | ||||
| } | ||||
| 
 | ||||
| @ -8296,7 +8749,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) | ||||
| 			 * we sync the deferred frees later in pass 1. | ||||
| 			 */ | ||||
| 			ASSERT3U(pass, >, 1); | ||||
| 			bplist_iterate(free_bpl, bpobj_enqueue_cb, | ||||
| 			bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, | ||||
| 			    &spa->spa_deferred_bpobj, tx); | ||||
| 		} | ||||
| 
 | ||||
| @ -8884,4 +9337,24 @@ MODULE_PARM_DESC(zfs_max_missing_tvds, | ||||
| 	" (in read-only mode)"); | ||||
| /* END CSTYLED */ | ||||
| 
 | ||||
| module_param(zfs_livelist_condense_zthr_pause, int, 0644); | ||||
| MODULE_PARM_DESC(zfs_livelist_condense_zthr_pause, | ||||
| 	"Set the livelist condense zthr to pause"); | ||||
| module_param(zfs_livelist_condense_sync_pause, int, 0644); | ||||
| MODULE_PARM_DESC(zfs_livelist_condense_sync_pause, | ||||
| 	"Set the livelist condense synctask to pause"); | ||||
| 
 | ||||
| module_param(zfs_livelist_condense_sync_cancel, int, 0644); | ||||
| MODULE_PARM_DESC(zfs_livelist_condense_sync_cancel, | ||||
| 	"Whether livelist condensing was canceled in the synctask"); | ||||
| module_param(zfs_livelist_condense_zthr_cancel, int, 0644); | ||||
| MODULE_PARM_DESC(zfs_livelist_condense_zthr_cancel, | ||||
| 	"Whether livelist condensing was canceled in the zthr function"); | ||||
| 
 | ||||
| /* BEGIN CSTYLED */ | ||||
| module_param(zfs_livelist_condense_new_alloc, int, 0644); | ||||
| MODULE_PARM_DESC(zfs_livelist_condense_new_alloc, | ||||
| 	"Whether extra ALLOC blkptrs were added to a livelist entry while it" | ||||
| 	" was being condensed"); | ||||
| /* END CSTYLED */ | ||||
| #endif | ||||
|  | ||||
| @ -21,7 +21,7 @@ | ||||
| 
 | ||||
| /*
 | ||||
|  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2011, 2015 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2011, 2018 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2014 Integros [integros.com] | ||||
|  * Copyright 2017 Joyent, Inc. | ||||
|  */ | ||||
| @ -413,7 +413,6 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) | ||||
| 
 | ||||
| 	/* spa_history_log_sync will free nvl */ | ||||
| 	return (err); | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | ||||
| @ -207,12 +207,15 @@ struct zthr { | ||||
| 	/* flag set to true if we are canceling the zthr */ | ||||
| 	boolean_t	zthr_cancel; | ||||
| 
 | ||||
| 	/* flag set to true if we are waiting for the zthr to finish */ | ||||
| 	boolean_t	zthr_haswaiters; | ||||
| 	kcondvar_t	zthr_wait_cv; | ||||
| 	/*
 | ||||
| 	 * maximum amount of time that the zthr is spent sleeping; | ||||
| 	 * if this is 0, the thread doesn't wake up until it gets | ||||
| 	 * signaled. | ||||
| 	 */ | ||||
| 	hrtime_t	zthr_wait_time; | ||||
| 	hrtime_t	zthr_sleep_timeout; | ||||
| 
 | ||||
| 	/* consumer-provided callbacks & data */ | ||||
| 	zthr_checkfunc_t	*zthr_checkfunc; | ||||
| @ -239,14 +242,18 @@ zthr_procedure(void *arg) | ||||
| 			 * order to prevent this process from incorrectly | ||||
| 			 * contributing to the system load average when idle. | ||||
| 			 */ | ||||
| 			if (t->zthr_wait_time == 0) { | ||||
| 			if (t->zthr_sleep_timeout == 0) { | ||||
| 				cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock); | ||||
| 			} else { | ||||
| 				(void) cv_timedwait_sig_hires(&t->zthr_cv, | ||||
| 				    &t->zthr_state_lock, t->zthr_wait_time, | ||||
| 				    &t->zthr_state_lock, t->zthr_sleep_timeout, | ||||
| 				    MSEC2NSEC(1), 0); | ||||
| 			} | ||||
| 		} | ||||
| 		if (t->zthr_haswaiters) { | ||||
| 			t->zthr_haswaiters = B_FALSE; | ||||
| 			cv_broadcast(&t->zthr_wait_cv); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| @ -280,12 +287,13 @@ zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func, | ||||
| 	mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL); | ||||
| 	mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL); | ||||
| 	cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL); | ||||
| 	cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL); | ||||
| 
 | ||||
| 	mutex_enter(&t->zthr_state_lock); | ||||
| 	t->zthr_checkfunc = checkfunc; | ||||
| 	t->zthr_func = func; | ||||
| 	t->zthr_arg = arg; | ||||
| 	t->zthr_wait_time = max_sleep; | ||||
| 	t->zthr_sleep_timeout = max_sleep; | ||||
| 
 | ||||
| 	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, | ||||
| 	    0, &p0, TS_RUN, minclsyspri); | ||||
| @ -303,6 +311,7 @@ zthr_destroy(zthr_t *t) | ||||
| 	mutex_destroy(&t->zthr_request_lock); | ||||
| 	mutex_destroy(&t->zthr_state_lock); | ||||
| 	cv_destroy(&t->zthr_cv); | ||||
| 	cv_destroy(&t->zthr_wait_cv); | ||||
| 	kmem_free(t, sizeof (*t)); | ||||
| } | ||||
| 
 | ||||
| @ -355,9 +364,8 @@ zthr_cancel(zthr_t *t) | ||||
| 	 * | ||||
| 	 * [1] The thread has already been cancelled, therefore | ||||
| 	 *     there is nothing for us to do. | ||||
| 	 * [2] The thread is sleeping, so we broadcast the CV first | ||||
| 	 *     to wake it up and then we set the flag and we are | ||||
| 	 *     waiting for it to exit. | ||||
| 	 * [2] The thread is sleeping so we set the flag, broadcast | ||||
| 	 *     the CV and wait for it to exit. | ||||
| 	 * [3] The thread is doing work, in which case we just set | ||||
| 	 *     the flag and wait for it to finish. | ||||
| 	 * [4] The thread was just created/resumed, in which case | ||||
| @ -397,6 +405,7 @@ zthr_resume(zthr_t *t) | ||||
| 	ASSERT3P(&t->zthr_checkfunc, !=, NULL); | ||||
| 	ASSERT3P(&t->zthr_func, !=, NULL); | ||||
| 	ASSERT(!t->zthr_cancel); | ||||
| 	ASSERT(!t->zthr_haswaiters); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * There are 4 states that we find the zthr in at this point | ||||
| @ -451,3 +460,74 @@ zthr_iscancelled(zthr_t *t) | ||||
| 	mutex_exit(&t->zthr_state_lock); | ||||
| 	return (cancelled); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Wait for the zthr to finish its current function. Similar to | ||||
|  * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end | ||||
|  * early. Unlike zthr_cancel, the thread is not destroyed. If the zthr was | ||||
|  * sleeping or cancelled, return immediately. | ||||
|  */ | ||||
| void | ||||
| zthr_wait_cycle_done(zthr_t *t) | ||||
| { | ||||
| 	mutex_enter(&t->zthr_state_lock); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Since we are holding the zthr_state_lock at this point | ||||
| 	 * we can find the zthr in one of the following 5 states: | ||||
| 	 * | ||||
| 	 * [1] The thread has already been cancelled, therefore | ||||
| 	 *     there is nothing for us to do. | ||||
| 	 * [2] The thread is sleeping so we set the flag, broadcast | ||||
| 	 *     the CV and wait for it to wake up and clear the flag. | ||||
| 	 * [3] The thread is doing work, in which case we just set | ||||
| 	 *     the flag and wait for it to finish. | ||||
| 	 * [4] The thread was just created/resumed, in which case | ||||
| 	 *     the behavior is similar to [3]. | ||||
| 	 * [5] The thread is in the middle of being cancelled, which is | ||||
| 	 *     similar to [3]. We'll wait for the cancel, which is | ||||
| 	 *     waiting for the zthr func. | ||||
| 	 * | ||||
| 	 * Since requests are serialized, by the time that we get | ||||
| 	 * control back we expect that the zthr has completed its | ||||
| 	 * zthr_func. | ||||
| 	 */ | ||||
| 	if (t->zthr_thread != NULL) { | ||||
| 		t->zthr_haswaiters = B_TRUE; | ||||
| 
 | ||||
| 		/* broadcast in case the zthr is sleeping */ | ||||
| 		cv_broadcast(&t->zthr_cv); | ||||
| 
 | ||||
| 		while ((t->zthr_haswaiters) && (t->zthr_thread != NULL)) | ||||
| 			cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock); | ||||
| 
 | ||||
| 		ASSERT(!t->zthr_haswaiters); | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_exit(&t->zthr_state_lock); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * This function is intended to be used by the zthr itself | ||||
|  * to check if another thread is waiting on it to finish. | ||||
|  * | ||||
|  * Returns B_TRUE if zthr_haswaiters is set (we have been asked to finish). | ||||
|  * | ||||
|  * Returns B_FALSE otherwise. | ||||
|  */ | ||||
| boolean_t | ||||
| zthr_has_waiters(zthr_t *t) | ||||
| { | ||||
| 	ASSERT3P(t->zthr_thread, ==, curthread); | ||||
| 
 | ||||
| 	mutex_enter(&t->zthr_state_lock); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Similarly to zthr_iscancelled(), we only grab the | ||||
| 	 * zthr_state_lock so that the zthr itself can use this | ||||
| 	 * to check for the request. | ||||
| 	 */ | ||||
| 	boolean_t has_waiters = t->zthr_haswaiters; | ||||
| 	mutex_exit(&t->zthr_state_lock); | ||||
| 	return (has_waiters); | ||||
| } | ||||
|  | ||||
| @ -147,12 +147,15 @@ tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', | ||||
| tags = ['functional', 'cli_root', 'zfs_create'] | ||||
| 
 | ||||
| [tests/functional/cli_root/zfs_destroy] | ||||
| tests = ['zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', | ||||
| tests = ['zfs_clone_livelist_condense_and_disable', | ||||
|     'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos', | ||||
|     'zfs_destroy_002_pos', 'zfs_destroy_003_pos', | ||||
|     'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', | ||||
|     'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', | ||||
|     'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', | ||||
|     'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', | ||||
|     'zfs_destroy_016_pos'] | ||||
|     'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', | ||||
|     'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] | ||||
| tags = ['functional', 'cli_root', 'zfs_destroy'] | ||||
| 
 | ||||
| [tests/functional/cli_root/zfs_diff] | ||||
|  | ||||
| @ -22,7 +22,7 @@ | ||||
| # | ||||
| # Copyright 2009 Sun Microsystems, Inc.  All rights reserved. | ||||
| # Use is subject to license terms. | ||||
| # Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
| # Copyright (c) 2012, 2018 by Delphix. All rights reserved. | ||||
| # Copyright (c) 2017 by Tim Chase. All rights reserved. | ||||
| # Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved. | ||||
| # Copyright (c) 2017 Lawrence Livermore National Security, LLC. | ||||
|  | ||||
| @ -2,6 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_destro | ||||
| dist_pkgdata_SCRIPTS = \
 | ||||
| 	setup.ksh \
 | ||||
| 	cleanup.ksh \
 | ||||
| 	zfs_clone_livelist_condense_and_disable.ksh \
 | ||||
| 	zfs_clone_livelist_condense_races.ksh \
 | ||||
| 	zfs_destroy_001_pos.ksh \
 | ||||
| 	zfs_destroy_002_pos.ksh \
 | ||||
| 	zfs_destroy_003_pos.ksh \
 | ||||
| @ -17,7 +19,10 @@ dist_pkgdata_SCRIPTS = \ | ||||
| 	zfs_destroy_013_neg.ksh \
 | ||||
| 	zfs_destroy_014_pos.ksh \
 | ||||
| 	zfs_destroy_015_pos.ksh \
 | ||||
| 	zfs_destroy_016_pos.ksh | ||||
| 	zfs_destroy_016_pos.ksh \
 | ||||
| 	zfs_destroy_clone_livelist.ksh \
 | ||||
| 	zfs_destroy_dev_removal.ksh \
 | ||||
| 	zfs_destroy_dev_removal_condense.ksh | ||||
| 
 | ||||
| dist_pkgdata_DATA = \
 | ||||
| 	zfs_destroy_common.kshlib \
 | ||||
|  | ||||
| @ -0,0 +1,125 @@ | ||||
| #!/bin/ksh -p | ||||
| # | ||||
| # This file and its contents are supplied under the terms of the | ||||
| # Common Development and Distribution License ("CDDL"), version 1.0. | ||||
| # You may only use this file in accordance with the terms of version | ||||
| # 1.0 of the CDDL. | ||||
| # | ||||
| # A full copy of the text of the CDDL should have accompanied this | ||||
| # source.  A copy of the CDDL is also available via the Internet at | ||||
| # http://www.illumos.org/license/CDDL. | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2018 by Delphix. All rights reserved. | ||||
| # | ||||
| 
 | ||||
| # DESCRIPTION | ||||
| # Verify zfs destroy test for clones with the livelist feature | ||||
| # enabled. | ||||
| 
 | ||||
| # STRATEGY | ||||
| # 1. Clone where livelist is condensed | ||||
| #	- create clone, write several files, delete those files | ||||
| #	- check that the number of livelist entries decreases | ||||
| #	  after the delete | ||||
| # 2. Clone where livelist is deactivated | ||||
| #	- create clone, write files. Delete those files and the | ||||
| #	  file in the filesystem when the snapshot was created | ||||
| #	  so the clone and snapshot no longer share data | ||||
| #	- check that the livelist is destroyed | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| . $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib | ||||
| 
 | ||||
| function cleanup | ||||
| { | ||||
| 	# Restore the tunables before touching the dataset so that they are | ||||
| 	# reset even if the destroy step below fails. | ||||
| 	# reset the livelist sublist size to the original value | ||||
| 	set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX | ||||
| 	# reset the minimum percent shared to the original value | ||||
| 	set_tunable32 zfs_livelist_min_percent_shared $ORIGINAL_MIN | ||||
| 	# The dataset may be missing if the test failed during setup. | ||||
| 	if datasetexists $TESTPOOL/$TESTFS1; then | ||||
| 		log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 | ||||
| 	fi | ||||
| } | ||||
| 
 | ||||
| # Fail the test unless the "Livelist" lines that zdb prints for $TESTPOOL | ||||
| # contain the given text. | ||||
| # $1   text expected in the Livelist output (e.g. "5 entries") | ||||
| # $2   message passed to log_fail when the text is missing | ||||
| function check_ll_len | ||||
| { | ||||
| 	# typeset keeps these from leaking into the caller's scope | ||||
| 	typeset string="$(zdb -vvvvv $TESTPOOL | grep "Livelist")" | ||||
| 	typeset substring="$1" | ||||
| 	typeset msg="$2" | ||||
| 	if test "${string#*$substring}" != "$string"; then | ||||
| 		return 0    # $substring is in $string | ||||
| 	fi | ||||
| 	# quote the zdb output so it is logged without word splitting | ||||
| 	log_note "$string" | ||||
| 	log_fail "$msg" # $substring is not in $string | ||||
| } | ||||
| 
 | ||||
| function test_condense | ||||
| { | ||||
| 	# set the max livelist entries to a small value to more easily | ||||
| 	# trigger a condense | ||||
| 	set_tunable64 zfs_livelist_max_entries 0x14 | ||||
| 	# set a small percent shared threshold so the livelist is not disabled | ||||
| 	set_tunable32 zfs_livelist_min_percent_shared 0xa | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE | ||||
| 
 | ||||
| 	# sync between each write to make sure a new entry is created | ||||
| 	for i in {0..4}; do | ||||
| 	    log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i | ||||
| 	    log_must zpool sync $TESTPOOL | ||||
| 	done | ||||
| 
 | ||||
| 	check_ll_len "5 entries" "Unexpected livelist size" | ||||
| 
 | ||||
| 	# sync between each write to allow for a condense of the previous entry | ||||
| 	for i in {0..4}; do | ||||
| 	    log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i | ||||
| 	    log_must zpool sync $TESTPOOL | ||||
| 	done | ||||
| 
 | ||||
| 	check_ll_len "6 entries" "Condense did not occur" | ||||
| 
 | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE | ||||
| 	check_livelist_gone | ||||
| } | ||||
| 
 | ||||
| function test_deactivated | ||||
| { | ||||
| 	# Threshold set to 50 percent | ||||
| 	set_tunable32 zfs_livelist_min_percent_shared 0x32 | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE | ||||
| 
 | ||||
| 	log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0 | ||||
| 	log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1 | ||||
| 	log_must zpool sync $TESTPOOL | ||||
| 	# snapshot and clone share 'atestfile', 33 percent | ||||
| 	check_livelist_gone | ||||
| 	log_must zfs destroy -R $TESTPOOL/$TESTCLONE | ||||
| 
 | ||||
| 	# Threshold set to 20 percent | ||||
| 	set_tunable32 zfs_livelist_min_percent_shared 0x14 | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE | ||||
| 
 | ||||
| 	log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0 | ||||
| 	log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1 | ||||
| 	log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE2 | ||||
| 	log_must zpool sync $TESTPOOL | ||||
| 	# snapshot and clone share 'atestfile', 25 percent | ||||
| 	check_livelist_exists $TESTCLONE | ||||
| 	log_must rm /$TESTPOOL/$TESTCLONE/atestfile | ||||
| 	# snapshot and clone share no files | ||||
| 	check_livelist_gone | ||||
| 	log_must zfs destroy -R $TESTPOOL/$TESTCLONE | ||||
| } | ||||
| 
 | ||||
| ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries) | ||||
| ORIGINAL_MIN=$(get_tunable zfs_livelist_min_percent_shared) | ||||
| 
 | ||||
| log_onexit cleanup | ||||
| log_must zfs create $TESTPOOL/$TESTFS1 | ||||
| log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile | ||||
| log_must zfs snapshot $TESTPOOL/$TESTFS1@snap | ||||
| test_condense | ||||
| test_deactivated | ||||
| 
 | ||||
| log_pass "Clone's livelist condenses and disables as expected." | ||||
| @ -0,0 +1,116 @@ | ||||
| #!/bin/ksh -p | ||||
| # | ||||
| # This file and its contents are supplied under the terms of the | ||||
| # Common Development and Distribution License ("CDDL"), version 1.0. | ||||
| # You may only use this file in accordance with the terms of version | ||||
| # 1.0 of the CDDL. | ||||
| # | ||||
| # A full copy of the text of the CDDL should have accompanied this | ||||
| # source.  A copy of the CDDL is also available via the Internet at | ||||
| # http://www.illumos.org/license/CDDL. | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2018 by Delphix. All rights reserved. | ||||
| # | ||||
| 
 | ||||
| # DESCRIPTION | ||||
| # Test race conditions for livelist condensing | ||||
| 
 | ||||
| # STRATEGY | ||||
| # These tests exercise code paths that deal with a livelist being | ||||
| # simultaneously condensed and deactivated (deleted, exported or disabled). | ||||
| # If a variable is set, the zthr will pause until it is cancelled or waited | ||||
| # and then a counter variable keeps track of whether or not the code path is | ||||
| # reached. | ||||
| 
 | ||||
| # 1. Deletion race: repeatedly overwrite the same file to trigger condense | ||||
| # and then delete the clone. | ||||
| # 2. Disable race: Overwrite enough files to trigger condenses and disabling of | ||||
| # the livelist. | ||||
| # 3. Export race: repeatedly overwrite the same file to trigger condense and | ||||
| # then export the pool. | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| 
 | ||||
| function cleanup | ||||
| { | ||||
| 	# Restore the tunables before touching the dataset so that the | ||||
| 	# condense pauses are lifted even if the destroy step below fails. | ||||
| 	# reset the livelist sublist size to the original value | ||||
| 	set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX | ||||
| 	# reset the condense tests to 0 | ||||
| 	set_tunable32 zfs_livelist_condense_zthr_pause 0 | ||||
| 	set_tunable32 zfs_livelist_condense_sync_pause 0 | ||||
| 	# The dataset may be missing if the test failed during setup. | ||||
| 	if datasetexists $TESTPOOL/$TESTFS1; then | ||||
| 		log_must zfs destroy -Rf $TESTPOOL/$TESTFS1 | ||||
| 	fi | ||||
| } | ||||
| 
 | ||||
| function delete_race | ||||
| { | ||||
| 	set_tunable32 "$1" 0 | ||||
| 	log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE | ||||
| 	for i in {1..5}; do | ||||
| 		log_must zpool sync $TESTPOOL | ||||
| 		log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out | ||||
| 	done | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE | ||||
| 	log_must zpool sync $TESTPOOL | ||||
| 	[[ "1" == "$(get_tunable "$1")" ]] || \ | ||||
| 	    log_fail "delete/condense race test failed" | ||||
| } | ||||
| 
 | ||||
| function export_race | ||||
| { | ||||
| 	set_tunable32 "$1" 0 | ||||
| 	log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE | ||||
| 	for i in {1..5}; do | ||||
| 		log_must zpool sync $TESTPOOL | ||||
| 		log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out | ||||
| 	done | ||||
| 	log_must zpool export $TESTPOOL | ||||
| 	log_must zpool import $TESTPOOL | ||||
| 	[[ "1" == "$(get_tunable "$1")" ]] || \ | ||||
| 	    log_fail "export/condense race test failed" | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE | ||||
| } | ||||
| 
 | ||||
| function disable_race | ||||
| { | ||||
| 	set_tunable32 "$1" 0 | ||||
| 	log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE | ||||
| 	for i in {1..5}; do | ||||
| 		log_must zpool sync $TESTPOOL | ||||
| 		log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out | ||||
| 	done | ||||
| 	# overwrite the file shared with the origin to trigger disable | ||||
| 	log_must mkfile 100m /$TESTPOOL/$TESTCLONE/atestfile | ||||
| 	log_must zpool sync $TESTPOOL | ||||
| 	[[ "1" == "$(get_tunable "$1")" ]] || \ | ||||
| 	    log_fail "disable/condense race test failed" | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE | ||||
| } | ||||
| 
 | ||||
| ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries) | ||||
| 
 | ||||
| log_onexit cleanup | ||||
| 
 | ||||
| log_must zfs create $TESTPOOL/$TESTFS1 | ||||
| log_must mkfile 100m /$TESTPOOL/$TESTFS1/atestfile | ||||
| log_must zpool sync $TESTPOOL | ||||
| log_must zfs snapshot $TESTPOOL/$TESTFS1@snap | ||||
| 
 | ||||
| # Reduce livelist size to trigger condense more easily | ||||
| set_tunable64 zfs_livelist_max_entries 0x14 | ||||
| 
 | ||||
| # Test cancellation path in the zthr | ||||
| set_tunable32 zfs_livelist_condense_zthr_pause 1 | ||||
| set_tunable32 zfs_livelist_condense_sync_pause 0 | ||||
| disable_race "zfs_livelist_condense_zthr_cancel" | ||||
| delete_race "zfs_livelist_condense_zthr_cancel" | ||||
| export_race "zfs_livelist_condense_zthr_cancel" | ||||
| 
 | ||||
| # Test cancellation path in the synctask | ||||
| set_tunable32 zfs_livelist_condense_zthr_pause 0 | ||||
| set_tunable32 zfs_livelist_condense_sync_pause 1 | ||||
| disable_race "zfs_livelist_condense_sync_cancel" | ||||
| delete_race "zfs_livelist_condense_sync_cancel" | ||||
| 
 | ||||
| log_pass "Clone livelist condense race conditions passed." | ||||
| @ -0,0 +1,140 @@ | ||||
| #!/bin/ksh -p | ||||
| # | ||||
| # This file and its contents are supplied under the terms of the | ||||
| # Common Development and Distribution License ("CDDL"), version 1.0. | ||||
| # You may only use this file in accordance with the terms of version | ||||
| # 1.0 of the CDDL. | ||||
| # | ||||
| # A full copy of the text of the CDDL should have accompanied this | ||||
| # source.  A copy of the CDDL is also available via the Internet at | ||||
| # http://www.illumos.org/license/CDDL. | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2018 by Delphix. All rights reserved. | ||||
| # | ||||
| 
 | ||||
| # DESCRIPTION | ||||
| # Verify zfs destroy test for clones with the livelist feature | ||||
| # enabled. | ||||
| 
 | ||||
| # STRATEGY | ||||
| # 1. One clone with an empty livelist | ||||
| #	- create the clone, check that livelist exists | ||||
| #	- delete the clone, check that livelist is eventually | ||||
| #	  destroyed | ||||
| # 2. One clone with populated livelist | ||||
| #	- create the clone, check that livelist exists | ||||
| #	- write multiple files to the clone | ||||
| #	- delete the clone, check that livelist is eventually | ||||
| #	  destroyed | ||||
| # 3. Multiple clones with empty livelists | ||||
| #	- same as 1. but with multiple clones | ||||
| # 4. Multiple clones with populated livelists | ||||
| #	- same as 2. but with multiple clones | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| . $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib | ||||
| 
 | ||||
| function cleanup | ||||
| { | ||||
| 	datasetexists $TESTPOOL/$TESTFS1 && zfs destroy -R $TESTPOOL/$TESTFS1 | ||||
| 	# reset the livelist sublist size to its original value | ||||
| 	set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX | ||||
| } | ||||
| 
 | ||||
| function clone_write_file | ||||
| { | ||||
| 	log_must mkfile 1m /$TESTPOOL/$1/$2 | ||||
| 	log_must zpool sync $TESTPOOL | ||||
| } | ||||
| 
 | ||||
| function test_one_empty | ||||
| { | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE | ||||
| 
 | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE | ||||
| 	check_livelist_gone | ||||
| } | ||||
| 
 | ||||
| function test_one | ||||
| { | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE | ||||
| 
 | ||||
| 	clone_write_file $TESTCLONE $TESTFILE0 | ||||
| 	clone_write_file $TESTCLONE $TESTFILE1 | ||||
| 	clone_write_file $TESTCLONE $TESTFILE2 | ||||
| 	log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE0 | ||||
| 	log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE2 | ||||
| 	check_livelist_exists $TESTCLONE | ||||
| 
 | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE | ||||
| 	check_livelist_gone | ||||
| } | ||||
| 
 | ||||
| function test_multiple_empty | ||||
| { | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE1 | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE2 | ||||
| 
 | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE1 | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE2 | ||||
| 	check_livelist_gone | ||||
| } | ||||
| 
 | ||||
| function test_multiple | ||||
| { | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE1 | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE2 | ||||
| 
 | ||||
| 	clone_write_file $TESTCLONE $TESTFILE0 | ||||
| 
 | ||||
| 	clone_write_file $TESTCLONE1 $TESTFILE0 | ||||
| 	clone_write_file $TESTCLONE1 $TESTFILE1 | ||||
| 	clone_write_file $TESTCLONE1 $TESTFILE2 | ||||
| 
 | ||||
| 	clone_write_file $TESTCLONE2 $TESTFILE0 | ||||
| 	log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE0 | ||||
| 	clone_write_file $TESTCLONE2 $TESTFILE1 | ||||
| 	log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE1 | ||||
| 
 | ||||
| 	check_livelist_exists $TESTCLONE | ||||
| 	check_livelist_exists $TESTCLONE1 | ||||
| 	check_livelist_exists $TESTCLONE2 | ||||
| 
 | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE1 | ||||
| 	log_must zfs destroy $TESTPOOL/$TESTCLONE2 | ||||
| 	check_livelist_gone | ||||
| } | ||||
| 
 | ||||
| function test_promote | ||||
| { | ||||
| 	clone_dataset $TESTFS1 snap $TESTCLONE | ||||
| 
 | ||||
| 	log_must zfs promote $TESTPOOL/$TESTCLONE | ||||
| 	check_livelist_gone | ||||
| 	log_must zfs destroy -R $TESTPOOL/$TESTCLONE | ||||
| } | ||||
| 
 | ||||
| ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries) | ||||
| 
 | ||||
| log_onexit cleanup | ||||
| log_must zfs create $TESTPOOL/$TESTFS1 | ||||
| log_must mkfile 20m /$TESTPOOL/$TESTFS1/atestfile | ||||
| log_must zfs snapshot $TESTPOOL/$TESTFS1@snap | ||||
| 
 | ||||
| # set a small livelist entry size to more easily test multiple entry livelists | ||||
| set_tunable64 zfs_livelist_max_entries 0x14 | ||||
| 
 | ||||
| test_one_empty | ||||
| test_one | ||||
| test_multiple_empty | ||||
| test_multiple | ||||
| test_promote | ||||
| 
 | ||||
| log_pass "Clone with the livelist feature enabled could be destroyed," \ | ||||
| 	"also could be promoted and destroyed as expected." | ||||
| @ -25,7 +25,7 @@ | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2012, 2016 by Delphix. All rights reserved. | ||||
| # Copyright (c) 2012, 2018 by Delphix. All rights reserved. | ||||
| # | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| @ -146,3 +146,43 @@ function check_dataset | ||||
| 		done | ||||
| 	fi | ||||
| } | ||||
| 
 | ||||
| # Use zdb to see if a livelist exists for a given clone | ||||
| # $1   clone name | ||||
| function check_livelist_exists | ||||
| { | ||||
| 	zdb -vvvvv $TESTPOOL/$1 | grep "Livelist" || \ | ||||
| 		log_fail "zdb could not find Livelist" | ||||
| } | ||||
| 
 | ||||
| # Wait for the deferred destroy livelists to be removed | ||||
| function wait_for_deferred_destroy | ||||
| { | ||||
| 	sync | ||||
| 	deleted=$(zdb -vvvvv $TESTPOOL | grep "Deleted Livelist") | ||||
| 	while [[ "$deleted" != "" ]]; do | ||||
| 		deleted=$(zdb -vvvvv $TESTPOOL | grep "Deleted Livelist") | ||||
| 	done | ||||
| } | ||||
| 
 | ||||
| # Check that a livelist has been removed, waiting for deferred destroy entries | ||||
| # to be cleared from zdb. | ||||
| function check_livelist_gone | ||||
| { | ||||
| 	wait_for_deferred_destroy | ||||
| 	zdb -vvvvv $TESTPOOL | grep "Livelist" && \ | ||||
| 		log_fail "zdb found Livelist after the clone is deleted." | ||||
| } | ||||
| 
 | ||||
| # Create a clone in the testpool based on $TESTFS@snap. Verify that the clone | ||||
| # was created and that it includes a livelist | ||||
| # $1    fs name | ||||
| # $2    snap name | ||||
| # $3    clone name | ||||
| function clone_dataset | ||||
| { | ||||
| 	log_must zfs clone $TESTPOOL/$1@$2 $TESTPOOL/$3 | ||||
| 	datasetexists $TESTPOOL/$3 || \ | ||||
| 		log_fail "zfs clone $TESTPOOL/$3 fail." | ||||
| 	check_livelist_exists $3 | ||||
| } | ||||
|  | ||||
| @ -0,0 +1,68 @@ | ||||
| #!/bin/ksh -p | ||||
| # | ||||
| # This file and its contents are supplied under the terms of the | ||||
| # Common Development and Distribution License ("CDDL"), version 1.0. | ||||
| # You may only use this file in accordance with the terms of version | ||||
| # 1.0 of the CDDL. | ||||
| # | ||||
| # A full copy of the text of the CDDL should have accompanied this | ||||
| # source.  A copy of the CDDL is also available via the Internet at | ||||
| # http://www.illumos.org/license/CDDL. | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2018 by Delphix. All rights reserved. | ||||
| # | ||||
| 
 | ||||
| # DESCRIPTION | ||||
| # Verify that livelists tracking remapped blocks can be | ||||
| # properly destroyed. | ||||
| 
 | ||||
| # STRATEGY | ||||
| # 1. Create a pool with disk1 and create a filesystem, snapshot | ||||
| # and clone. Write several files to the clone. | ||||
| # 2. Add disk2 to the pool and then remove disk1, triggering a | ||||
| # remap of the blkptrs tracked in the livelist. | ||||
| # 3. Delete the clone | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| . $STF_SUITE/tests/functional/removal/removal.kshlib | ||||
| 
 | ||||
| # Destroy the scratch pool and remove the file-backed vdevs, if present. | ||||
| function cleanup | ||||
| { | ||||
| 	poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2 | ||||
| 	[[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1 | ||||
| 	# was "lot_must", which is not a command, so disk2 was never removed | ||||
| 	[[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2 | ||||
| } | ||||
| 
 | ||||
| log_onexit cleanup | ||||
| 
 | ||||
| VIRTUAL_DISK1=/var/tmp/disk1 | ||||
| VIRTUAL_DISK2=/var/tmp/disk2 | ||||
| log_must mkfile $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1 | ||||
| log_must mkfile $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2 | ||||
| 
 | ||||
| log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1 | ||||
| log_must poolexists $TESTPOOL2 | ||||
| 
 | ||||
| log_must zfs create $TESTPOOL2/$TESTFS | ||||
| log_must mkfile 25m /$TESTPOOL2/$TESTFS/atestfile | ||||
| log_must zfs snapshot $TESTPOOL2/$TESTFS@snap | ||||
| 
 | ||||
| log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE | ||||
| 
 | ||||
| log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE0 | ||||
| log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE1 | ||||
| log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE2 | ||||
| 
 | ||||
| log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2 | ||||
| log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1 | ||||
| wait_for_removal $TESTPOOL2 | ||||
| 
 | ||||
| log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE0 | ||||
| log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE1 | ||||
| 
 | ||||
| log_must zfs destroy $TESTPOOL2/$TESTCLONE | ||||
| 
 | ||||
| log_pass "Clone with the livelist feature and remapped blocks," \ | ||||
| 	"can be destroyed." | ||||
| @ -0,0 +1,93 @@ | ||||
| #!/bin/ksh -p | ||||
| # | ||||
| # This file and its contents are supplied under the terms of the | ||||
| # Common Development and Distribution License ("CDDL"), version 1.0. | ||||
| # You may only use this file in accordance with the terms of version | ||||
| # 1.0 of the CDDL. | ||||
| # | ||||
| # A full copy of the text of the CDDL should have accompanied this | ||||
| # source.  A copy of the CDDL is also available via the Internet at | ||||
| # http://www.illumos.org/license/CDDL. | ||||
| # | ||||
| 
 | ||||
| # | ||||
| # Copyright (c) 2018 by Delphix. All rights reserved. | ||||
| # | ||||
| 
 | ||||
| # DESCRIPTION | ||||
| # Verify that livelists tracking remapped blocks can be | ||||
| # properly condensed. | ||||
| 
 | ||||
| # STRATEGY | ||||
| # 1. Create a pool with disk1 and create a filesystem, snapshot | ||||
| # and clone. Create two files for the first livelist entry and | ||||
| # pause condensing. | ||||
| # 2. Add disk2 to the pool and then remove disk1, triggering a | ||||
| # remap of the blkptrs tracked in the livelist. | ||||
| # 3. Overwrite the first file several times to trigger a condense, | ||||
| # overwrite the second file once and resume condensing, now with | ||||
| # extra blkptrs added during the remap | ||||
| # 4. Check that the test added new ALLOC blkptrs mid-condense using | ||||
| # a variable set in that code path | ||||
| 
 | ||||
| . $STF_SUITE/include/libtest.shlib | ||||
| . $STF_SUITE/tests/functional/removal/removal.kshlib | ||||
| 
 | ||||
| # Exit handler for this test: destroys $TESTPOOL2 if it still exists, | ||||
| # restores zfs_livelist_max_entries to the value saved in ORIGINAL_MAX, | ||||
| # and removes the two file-backed virtual disks if they are present. | ||||
| function cleanup | ||||
| { | ||||
| 	poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2 | ||||
| 	# reset livelist max size | ||||
| 	set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX | ||||
| 	[[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1 | ||||
| 	# was "lot_must" (typo), which left disk2 behind on every run | ||||
| 	[[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2 | ||||
| } | ||||
| 
 | ||||
| log_onexit cleanup | ||||
| 
 | ||||
| ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries) | ||||
| set_tunable64 zfs_livelist_max_entries 0x14 | ||||
| 
 | ||||
| VIRTUAL_DISK1=/var/tmp/disk1 | ||||
| VIRTUAL_DISK2=/var/tmp/disk2 | ||||
| log_must mkfile $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1 | ||||
| log_must mkfile $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2 | ||||
| 
 | ||||
| log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1 | ||||
| log_must poolexists $TESTPOOL2 | ||||
| 
 | ||||
| log_must zfs create $TESTPOOL2/$TESTFS | ||||
| log_must mkfile 100m /$TESTPOOL2/$TESTFS/atestfile | ||||
| log_must zfs snapshot $TESTPOOL2/$TESTFS@snap | ||||
| 
 | ||||
| log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE | ||||
| 
 | ||||
| # Create initial files and pause condense zthr on next execution | ||||
| log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A | ||||
| log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/B | ||||
| log_must zpool sync $TESTPOOL2 | ||||
| set_tunable32 zfs_livelist_condense_sync_pause 1 | ||||
| 
 | ||||
| # Add a new dev and remove the old one | ||||
| log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2 | ||||
| log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1 | ||||
| wait_for_removal $TESTPOOL2 | ||||
| 
 | ||||
| set_tunable32 zfs_livelist_condense_new_alloc 0 | ||||
| # Trigger a condense | ||||
| log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A | ||||
| log_must zpool sync $TESTPOOL2 | ||||
| log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A | ||||
| log_must zpool sync $TESTPOOL2 | ||||
| # Write remapped blkptrs which will modify the livelist mid-condense | ||||
| log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/B | ||||
| 
 | ||||
| # Resume condense zthr | ||||
| set_tunable32 zfs_livelist_condense_sync_pause 0 | ||||
| log_must zpool sync $TESTPOOL2 | ||||
| # Check that we've added new ALLOC blkptrs during the condense. | ||||
| # Compare numerically: a lexicographic "<" would also succeed on any | ||||
| # non-numeric get_tunable output that happens to sort after "0". | ||||
| [[ "$(get_tunable zfs_livelist_condense_new_alloc)" -gt 0 ]] || \ | ||||
|     log_fail "removal/condense test failed" | ||||
| 
 | ||||
| log_must zfs destroy $TESTPOOL2/$TESTCLONE | ||||
| log_pass "Clone with the livelist feature and remapped blocks," \ | ||||
| 	"can be condensed." | ||||
| @ -93,5 +93,6 @@ if is_linux; then | ||||
| 	    "feature@allocation_classes" | ||||
| 	    "feature@resilver_defer" | ||||
| 	    "feature@bookmark_v2" | ||||
| 	    "feature@livelist" | ||||
| 	) | ||||
| fi | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Sara Hartse
						Sara Hartse