Fast Clone Deletion

Deleting a clone requires finding blocks are clone-only, not shared with the snapshot. This was done by traversing the entire block tree which results in a large performance penalty for sparsely written clones. This is new method keeps track of clone blocks when they are modified in a "Livelist" so that, when it’s time to delete, the clone-specific blocks are already at hand. We see performance improvements because now deletion work is proportional to the number of clone-modified blocks, not the size of the original dataset. Reviewed-by: Sean Eric Fagan <sef@ixsystems.com> Reviewed-by: Matt Ahrens <matt@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Signed-off-by: Sara Hartse <sara.hartse@delphix.com> Closes #8416
2026-05-22 02:27:36 +03:00 · 2019-07-26 10:54:14 -07:00
parent d274ac5460
commit 37f03da8ba
38 changed files with 2583 additions and 205 deletions
@@ -348,6 +348,18 @@ zpool_feature_init(void)
 	    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
 	    ZFEATURE_TYPE_BOOLEAN, NULL);

+	{
+	static const spa_feature_t livelist_deps[] = {
+		SPA_FEATURE_EXTENSIBLE_DATASET,
+		SPA_FEATURE_NONE
+	};
+	zfeature_register(SPA_FEATURE_LIVELIST,
+	    "com.delphix:livelist", "livelist",
+	    "Improved clone deletion performance.",
+	    ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+	    livelist_deps);
+	}
+
 	{
 	static const spa_feature_t log_spacemap_deps[] = {
 		SPA_FEATURE_SPACEMAP_V2,
@@ -20,7 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

 #include <sys/bplist.h>
@@ -75,3 +75,17 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
 	}
 	mutex_exit(&bpl->bpl_lock);
 }
+
+void
+bplist_clear(bplist_t *bpl)
+{
+	bplist_entry_t *bpe;
+
+	mutex_enter(&bpl->bpl_lock);
+	while ((bpe = list_head(&bpl->bpl_list))) {
+		bplist_iterate_last_removed = bpe;
+		list_remove(&bpl->bpl_list, bpe);
+		kmem_free(bpe, sizeof (*bpe));
+	}
+	mutex_exit(&bpl->bpl_lock);
+}
@@ -20,7 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

@@ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
 		size = BPOBJ_SIZE_V0;
 	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
 		size = BPOBJ_SIZE_V1;
+	else if (!spa_feature_is_active(dmu_objset_spa(os),
+	    SPA_FEATURE_LIVELIST))
+		size = BPOBJ_SIZE_V2;
 	else
 		size = sizeof (bpobj_phys_t);

@@ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
 	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
 	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
 	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
 	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
 	return (0);
 }
@@ -245,8 +249,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
 * Update bpobj and all of its parents with new space accounting.
 */
 static void
-propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
-    uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
+propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
+    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
 {

 	for (; bpi != NULL; bpi = bpi->bpi_parent) {
@@ -263,22 +267,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,

 static int
 bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
-    dmu_tx_t *tx, boolean_t free)
+    int64_t start, dmu_tx_t *tx, boolean_t free)
 {
 	int err = 0;
-	uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
+	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
 	dmu_buf_t *dbuf = NULL;
 	bpobj_t *bpo = bpi->bpi_bpo;

-	for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+	for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
 		uint64_t offset = i * sizeof (blkptr_t);
 		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

 		if (dbuf == NULL || dbuf->db_offset > offset) {
 			if (dbuf)
 				dmu_buf_rele(dbuf, FTAG);
-			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
-			    FTAG, &dbuf, 0);
+			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+			    offset, FTAG, &dbuf, 0);
 			if (err)
 				break;
 		}
@@ -288,18 +292,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,

 		blkptr_t *bparray = dbuf->db_data;
 		blkptr_t *bp = &bparray[blkoff];
-		err = func(arg, bp, tx);
+
+		boolean_t bp_freed = BP_GET_FREE(bp);
+		err = func(arg, bp, bp_freed, tx);
 		if (err)
 			break;

 		if (free) {
+			int sign = bp_freed ? -1 : +1;
 			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
-			freed += bp_get_dsize_sync(spa, bp);
-			comp_freed += BP_GET_PSIZE(bp);
-			uncomp_freed += BP_GET_UCSIZE(bp);
+			freed += sign * bp_get_dsize_sync(spa, bp);
+			comp_freed += sign * BP_GET_PSIZE(bp);
+			uncomp_freed += sign * BP_GET_UCSIZE(bp);
 			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
 			bpo->bpo_phys->bpo_num_blkptrs--;
 			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+			if (bp_freed) {
+				ASSERT(bpo->bpo_havefreed);
+				bpo->bpo_phys->bpo_num_freed--;
+				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
+			}
 		}
 	}
 	if (free) {
@@ -328,7 +340,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
 */
 static int
 bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
-    dmu_tx_t *tx, boolean_t free)
+    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
 {
 	list_t stack;
 	bpobj_info_t *bpi;
@@ -341,6 +353,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
 	list_create(&stack, sizeof (bpobj_info_t),
 	    offsetof(bpobj_info_t, bpi_node));
 	mutex_enter(&initial_bpo->bpo_lock);
+
+	if (bpobj_size != NULL)
+		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
+
 	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

 	while ((bpi = list_head(&stack)) != NULL) {
@@ -354,7 +370,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
 			dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

 		if (bpi->bpi_visited == B_FALSE) {
-			err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
+			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
+			    free);
 			bpi->bpi_visited = B_TRUE;
 			if (err != 0)
 				break;
@@ -433,6 +450,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
 			 * We have unprocessed subobjs. Process the next one.
 			 */
 			ASSERT(bpo->bpo_havecomp);
+			ASSERT3P(bpobj_size, ==, NULL);

 			/* Add the last subobj to stack. */
 			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
@@ -489,16 +507,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
 int
 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
 {
-	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
 }

 /*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
+ *
+ * If there are no subobjs:
+ *
+ * *bpobj_size can be used to return the number of block pointers in the
+ * bpobj.  Note that this may be different from the number of block pointers
+ * that are iterated over, if iteration is terminated early (e.g. by the func
+ * returning nonzero).
+ *
+ * If there are concurrent (or subsequent) modifications to the bpobj then the
+ * returned *bpobj_size can be passed as "start" to
+ * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
 int
-bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
+    uint64_t *bpobj_size)
 {
-	return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
+}
+
+/*
+ * Iterate over the blkptrs in the bpobj beginning at index start. If func
+ * returns nonzero, iteration will stop. This is a livelist specific function
+ * since it assumes that there are no subobjs present.
+ */
+int
+livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
+    int64_t start)
+{
+	if (bpo->bpo_havesubobj)
+		VERIFY0(bpo->bpo_phys->bpo_subobjs);
+	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
+	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
+	kmem_free(bpi, sizeof (bpobj_info_t));
+	return (err);
 }

 /*
@@ -724,7 +771,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 }

 void
-bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
 {
 	blkptr_t stored_bp = *bp;
 	uint64_t offset;
@@ -755,8 +803,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
 		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
 	}

-	/* We never need the fill count. */
 	stored_bp.blk_fill = 0;
+	BP_SET_FREE(&stored_bp, bp_freed);

 	mutex_enter(&bpo->bpo_lock);

@@ -779,11 +827,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)

 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 	bpo->bpo_phys->bpo_num_blkptrs++;
-	bpo->bpo_phys->bpo_bytes +=
+	int sign = bp_freed ? -1 : +1;
+	bpo->bpo_phys->bpo_bytes += sign *
 	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
 	if (bpo->bpo_havecomp) {
-		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
-		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
+		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
+	}
+	if (bp_freed) {
+		ASSERT(bpo->bpo_havefreed);
+		bpo->bpo_phys->bpo_num_freed++;
 	}
 	mutex_exit(&bpo->bpo_lock);
 }
@@ -799,7 +852,7 @@ struct space_range_arg {

 /* ARGSUSED */
 static int
-space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 {
 	struct space_range_arg *sra = arg;

@@ -863,3 +916,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
 	*uncompp = sra.uncomp;
 	return (err);
 }
+
+/*
+ * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
+ * bpobj are designated as free or allocated that information is not preserved
+ * in bplists.
+ */
+/* ARGSUSED */
+int
+bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	bplist_t *bpl = arg;
+	bplist_append(bpl, bp);
+	return (0);
+}
@@ -3286,6 +3286,13 @@ dbuf_hold_impl_arg(struct dbuf_hold_arg *dh)

 	*(dh->dh_dbp) = NULL;

+	/* If the pool has been created, verify the tx_sync_lock is not held */
+	spa_t *spa = dh->dh_dn->dn_objset->os_spa;
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	if (dp != NULL) {
+		ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
+	}
+
 	/* dbuf_find() returns with db_mtx held */
 	dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
 	    dh->dh_level, dh->dh_blkid);
@@ -4479,6 +4486,29 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 	drica.drica_tx = tx;
 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
 	    &drica)) {
+		/*
+		 * If the blkptr being remapped is tracked by a livelist,
+		 * then we need to make sure the livelist reflects the update.
+		 * First, cancel out the old blkptr by appending a 'FREE'
+		 * entry. Next, add an 'ALLOC' to track the new version. This
+		 * way we avoid trying to free an inaccurate blkptr at delete.
+		 * Note that embedded blkptrs are not tracked in livelists.
+		 */
+		if (dn->dn_objset != spa_meta_objset(spa)) {
+			dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
+			if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+			    bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+				ASSERT(!BP_IS_EMBEDDED(bp));
+				ASSERT(dsl_dir_is_clone(ds->ds_dir));
+				ASSERT(spa_feature_is_enabled(spa,
+				    SPA_FEATURE_LIVELIST));
+				bplist_append(&ds->ds_dir->dd_pending_frees,
+				    bp);
+				bplist_append(&ds->ds_dir->dd_pending_allocs,
+				    &bp_copy);
+			}
+		}
+
 		/*
 		 * The db_rwlock prevents dbuf_read_impl() from
 		 * dereferencing the BP while we are changing it.  To
@@ -122,13 +122,12 @@ parent_delta(dsl_dataset_t *ds, int64_t delta)
 void
 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 {
-	int used, compressed, uncompressed;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	int used = bp_get_dsize_sync(spa, bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
 	int64_t delta;

-	used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
-	compressed = BP_GET_PSIZE(bp);
-	uncompressed = BP_GET_UCSIZE(bp);
-
 	dprintf_bp(bp, "ds=%p", ds);

 	ASSERT(dmu_tx_is_syncing(tx));
@@ -164,6 +163,19 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 		ds->ds_feature_activation[f] = (void *)B_TRUE;
 	}

+	/*
+	 * Track block for livelist, but ignore embedded blocks because
+	 * they do not need to be freed.
+	 */
+	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+	    bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+	    !(BP_IS_EMBEDDED(bp))) {
+		ASSERT(dsl_dir_is_clone(ds->ds_dir));
+		ASSERT(spa_feature_is_enabled(spa,
+		    SPA_FEATURE_LIVELIST));
+		bplist_append(&ds->ds_dir->dd_pending_allocs, bp);
+	}
+
 	mutex_exit(&ds->ds_lock);
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 	    compressed, uncompressed, tx);
@@ -207,8 +219,8 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
 		DVA_SET_VDEV(dva, vdev);
 		DVA_SET_OFFSET(dva, offset);
 		DVA_SET_ASIZE(dva, size);
-
-		dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx);
+		dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE,
+		    tx);
 	}
 }

@@ -239,6 +251,19 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 	ASSERT(!ds->ds_is_snapshot);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);

+	/*
+	 * Track block for livelist, but ignore embedded blocks because
+	 * they do not need to be freed.
+	 */
+	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+	    bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+	    !(BP_IS_EMBEDDED(bp))) {
+		ASSERT(dsl_dir_is_clone(ds->ds_dir));
+		ASSERT(spa_feature_is_enabled(spa,
+		    SPA_FEATURE_LIVELIST));
+		bplist_append(&ds->ds_dir->dd_pending_frees, bp);
+	}
+
 	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		int64_t delta;

@@ -267,7 +292,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 			 */
 			bplist_append(&ds->ds_pending_deadlist, bp);
 		} else {
-			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
+			dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx);
 		}
 		ASSERT3U(ds->ds_prev->ds_object, ==,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
@@ -1241,6 +1266,14 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,

 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(lastname[0] != '@');
+	/*
+	 * Filesystems will eventually have their origin set to dp_origin_snap,
+	 * but that's taken care of in dsl_dataset_create_sync_dd. When
+	 * creating a filesystem, this function is called with origin equal to
+	 * NULL.
+	 */
+	if (origin != NULL)
+		ASSERT3P(origin, !=, dp->dp_origin_snap);

 	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
@@ -1250,6 +1283,20 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,

 	dsl_deleg_set_create_perms(dd, tx, cr);

+	/*
+	 * If we are creating a clone and the livelist feature is enabled,
+	 * add the entry DD_FIELD_LIVELIST to ZAP.
+	 */
+	if (origin != NULL &&
+	    spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) {
+		objset_t *mos = dd->dd_pool->dp_meta_objset;
+		dsl_dir_zapify(dd, tx);
+		uint64_t obj = dsl_deadlist_alloc(mos, tx);
+		VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST,
+		    sizeof (uint64_t), 1, &obj, tx));
+		spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx);
+	}
+
 	/*
 	 * Since we're creating a new node we know it's a leaf, so we can
 	 * initialize the counts if the limit feature is active.
@@ -2036,12 +2083,149 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 	}
 }

-static int
-deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+/*
+ * Check if the percentage of blocks shared between the clone and the
+ * snapshot (as opposed to those that are clone only) is below a certain
+ * threshold
+ */
+boolean_t
+dsl_livelist_should_disable(dsl_dataset_t *ds)
 {
-	dsl_deadlist_t *dl = arg;
-	dsl_deadlist_insert(dl, bp, tx);
-	return (0);
+	uint64_t used, referenced;
+	int percent_shared;
+
+	used = dsl_dir_get_usedds(ds->ds_dir);
+	referenced = dsl_get_referenced(ds);
+	ASSERT3U(referenced, >=, 0);
+	ASSERT3U(used, >=, 0);
+	if (referenced == 0)
+		return (B_FALSE);
+	percent_shared = (100 * (referenced - used)) / referenced;
+	if (percent_shared <= zfs_livelist_min_percent_shared)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+/*
+ *  Check if it is possible to combine two livelist entries into one.
+ *  This is the case if the combined number of 'live' blkptrs (ALLOCs that
+ *  don't have a matching FREE) is under the maximum sublist size.
+ *  We check this by subtracting twice the total number of frees from the total
+ *  number of blkptrs. FREEs are counted twice because each FREE blkptr
+ *  will cancel out an ALLOC blkptr when the livelist is processed.
+ */
+static boolean_t
+dsl_livelist_should_condense(dsl_deadlist_entry_t *first,
+    dsl_deadlist_entry_t *next)
+{
+	uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed +
+	    next->dle_bpobj.bpo_phys->bpo_num_freed;
+	uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs +
+	    next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+	if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+typedef struct try_condense_arg {
+	spa_t *spa;
+	dsl_dataset_t *ds;
+} try_condense_arg_t;
+
+/*
+ * Iterate over the livelist entries, searching for a pair to condense.
+ * A nonzero return value means stop, 0 means keep looking.
+ */
+static int
+dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first)
+{
+	try_condense_arg_t *tca = arg;
+	spa_t *spa = tca->spa;
+	dsl_dataset_t *ds = tca->ds;
+	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
+	dsl_deadlist_entry_t *next;
+
+	/* The condense thread has not yet been created at import */
+	if (spa->spa_livelist_condense_zthr == NULL)
+		return (1);
+
+	/* A condense is already in progress */
+	if (spa->spa_to_condense.ds != NULL)
+		return (1);
+
+	next = AVL_NEXT(&ll->dl_tree, &first->dle_node);
+	/* The livelist has only one entry - don't condense it */
+	if (next == NULL)
+		return (1);
+
+	/* Next is the newest entry - don't condense it */
+	if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL)
+		return (1);
+
+	/* This pair is not ready to condense but keep looking */
+	if (!dsl_livelist_should_condense(first, next))
+		return (0);
+
+	/*
+	 * Add a ref to prevent the dataset from being evicted while
+	 * the condense zthr or synctask are running. Ref will be
+	 * released at the end of the condense synctask
+	 */
+	dmu_buf_add_ref(ds->ds_dbuf, spa);
+
+	spa->spa_to_condense.ds = ds;
+	spa->spa_to_condense.first = first;
+	spa->spa_to_condense.next = next;
+	spa->spa_to_condense.syncing = B_FALSE;
+	spa->spa_to_condense.cancelled = B_FALSE;
+
+	zthr_wakeup(spa->spa_livelist_condense_zthr);
+	return (1);
+}
+
+static void
+dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd = ds->ds_dir;
+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+	dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist);
+
+	/* Check if we need to add a new sub-livelist */
+	if (last == NULL) {
+		/* The livelist is empty */
+		dsl_deadlist_add_key(&dd->dd_livelist,
+		    tx->tx_txg - 1, tx);
+	} else if (spa_sync_pass(spa) == 1) {
+		/*
+		 * Check if the newest entry is full. If it is, make a new one.
+		 * We only do this once per sync because we could overfill a
+		 * sublist in one sync pass and don't want to add another entry
+		 * for a txg that is already represented. This ensures that
+		 * blkptrs born in the same txg are stored in the same sublist.
+		 */
+		bpobj_t bpobj = last->dle_bpobj;
+		uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;
+		uint64_t free = bpobj.bpo_phys->bpo_num_freed;
+		uint64_t alloc = all - free;
+		if (alloc > zfs_livelist_max_entries) {
+			dsl_deadlist_add_key(&dd->dd_livelist,
+			    tx->tx_txg - 1, tx);
+		}
+	}
+
+	/* Insert each entry into the on-disk livelist */
+	bplist_iterate(&dd->dd_pending_allocs,
+	    dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);
+	bplist_iterate(&dd->dd_pending_frees,
+	    dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);
+
+	/* Attempt to condense every pair of adjacent entries */
+	try_condense_arg_t arg = {
+	    .spa = spa,
+	    .ds = ds
+	};
+	dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,
+	    &arg);
 }

 void
@@ -2050,7 +2234,14 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
 	objset_t *os = ds->ds_objset;

 	bplist_iterate(&ds->ds_pending_deadlist,
-	    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+	    dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);
+
+	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
+		dsl_flush_pending_livelist(ds, tx);
+		if (dsl_livelist_should_disable(ds)) {
+			dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);
+		}
+	}

 	dsl_bookmark_sync_done(ds, tx);

@@ -3335,6 +3526,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 	uint64_t oldnext_obj;
 	int64_t delta;

+	ASSERT(nvlist_empty(ddpa->err_ds));
+
 	VERIFY0(promote_hold(ddpa, dp, FTAG));
 	hds = ddpa->ddpa_clone;

@@ -3519,6 +3712,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)

 	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;

+	/*
+	 * Since livelists are specific to a clone's origin txg, they
+	 * are no longer accurate. Destroy the livelist from the clone being
+	 * promoted. If the origin dataset is a clone, destroy its livelist
+	 * as well.
+	 */
+	dsl_dir_remove_livelist(dd, tx, B_TRUE);
+	dsl_dir_remove_livelist(origin_ds->ds_dir, tx, B_TRUE);
+
 	/* log history record */
 	spa_history_log_internal_ds(hds, "promote", tx, "");

@@ -3990,6 +4192,14 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,

 	dsl_scan_ds_clone_swapped(origin_head, clone, tx);

+	/*
+	 * Destroy any livelists associated with the clone or the origin,
+	 * since after the swap the corresponding livelists are no longer
+	 * valid.
+	 */
+	dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE);
+	dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE);
+
 	spa_history_log_internal_ds(clone, "clone swap", tx,
 	    "parent=%s", origin_head->ds_dir->dd_myname);
 }
@@ -20,16 +20,16 @@
 */
 /*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

-#include <sys/dsl_dataset.h>
 #include <sys/dmu.h>
 #include <sys/refcount.h>
 #include <sys/zap.h>
 #include <sys/zfs_context.h>
 #include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>

 /*
 * Deadlist concurrency:
@@ -51,6 +51,68 @@
 * provides its own locking, and dl_oldfmt is immutable.
 */

+/*
+ * Livelist Overview
+ * ================
+ *
+ * Livelists use the same 'deadlist_t' struct as deadlists and are also used
+ * to track blkptrs over the lifetime of a dataset. Livelists however, belong
+ * to clones and track the blkptrs that are clone-specific (were born after
+ * the clone's creation). The exception is embedded block pointers which are
+ * not included in livelists because they do not need to be freed.
+ *
+ * When it comes time to delete the clone, the livelist provides a quick
+ * reference as to what needs to be freed. For this reason, livelists also track
+ * when clone-specific blkptrs are freed before deletion to prevent double
+ * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the
+ * deletion algorithm iterates backwards over the livelist, matching
+ * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists
+ * are also updated in the case when blkptrs are remapped: the old version
+ * of the blkptr is cancelled out with a FREE and the new version is tracked
+ * with an ALLOC.
+ *
+ * To bound the amount of memory required for deletion, livelists over a
+ * certain size are spread over multiple entries. Entries are grouped by
+ * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
+ * be in the same entry. This allows us to delete livelists incrementally
+ * over multiple syncs, one entry at a time.
+ *
+ * During the lifetime of the clone, livelists can get extremely large.
+ * Their size is managed by periodic condensing (preemptively cancelling out
+ * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
+ * the shared space between the clone and its origin is so small that it
+ * doesn't make sense to use livelists anymore.
+ */
+
+/*
+ * The threshold sublist size at which we create a new sub-livelist for the
+ * next txg. However, since blkptrs of the same transaction group must be in
+ * the same sub-list, the actual sublist size may exceed this. When picking the
+ * size we had to balance the fact that larger sublists mean fewer sublists
+ * (decreasing the cost of insertion) against the consideration that sublists
+ * will be loaded into memory and shouldn't take up an inordinate amount of
+ * space. We settled on ~500000 entries, corresponding to roughly 128M.
+ */
+unsigned long zfs_livelist_max_entries = 500000;
+
+/*
+ * We can approximate how much of a performance gain a livelist will give us
+ * based on the percentage of blocks shared between the clone and its origin.
+ * 0 percent shared means that the clone has completely diverged and that the
+ * old method is maximally effective: every read from the block tree will
+ * result in lots of frees. Livelists give us gains when they track blocks
+ * scattered across the tree, when one read in the old method might only
+ * result in a few frees. Once the clone has been overwritten enough,
+ * writes are no longer sparse and we'll no longer get much of a benefit from
+ * tracking them with a livelist. We chose a lower limit of 75 percent shared
+ * (25 percent overwritten). This means that 1/4 of all block pointers will be
+ * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
+ * to make deletion 4x faster. Once the amount of shared space drops below this
+ * threshold, the clone will revert to the old deletion method.
+ */
+int zfs_livelist_min_percent_shared = 75;
+
+
 static int
 dsl_deadlist_compare(const void *arg1, const void *arg2)
 {
@@ -88,6 +150,23 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
 	dl->dl_havetree = B_TRUE;
 }

+void
+dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
+{
+	dsl_deadlist_entry_t *dle;
+
+	ASSERT(dsl_deadlist_is_open(dl));
+
+	mutex_enter(&dl->dl_lock);
+	dsl_deadlist_load_tree(dl);
+	mutex_exit(&dl->dl_lock);
+	for (dle = avl_first(&dl->dl_tree); dle != NULL;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		if (func(args, dle) != 0)
+			break;
+	}
+}
+
 void
 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
 {
@@ -188,7 +267,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)

 static void
 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
-    const blkptr_t *bp, dmu_tx_t *tx)
+    const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 {
 	ASSERT(MUTEX_HELD(&dl->dl_lock));
 	if (dle->dle_bpobj.bpo_object ==
@@ -200,7 +279,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
 		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
 		    dle->dle_mintxg, obj, tx));
 	}
-	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+	bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
 }

 static void
@@ -221,14 +300,15 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
 }

 void
-dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
 {
 	dsl_deadlist_entry_t dle_tofind;
 	dsl_deadlist_entry_t *dle;
 	avl_index_t where;

 	if (dl->dl_oldfmt) {
-		bpobj_enqueue(&dl->dl_bpobj, bp, tx);
+		bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
 		return;
 	}

@@ -236,10 +316,12 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 	dsl_deadlist_load_tree(dl);

 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+
+	int sign = bp_freed ? -1 : +1;
 	dl->dl_phys->dl_used +=
-	    bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
-	dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
-	dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
+	    sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+	dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
+	dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);

 	dle_tofind.dle_mintxg = bp->blk_birth;
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
@@ -255,10 +337,26 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 	}

 	ASSERT3P(dle, !=, NULL);
-	dle_enqueue(dl, dle, bp, tx);
+	dle_enqueue(dl, dle, bp, bp_freed, tx);
 	mutex_exit(&dl->dl_lock);
 }

+int
+dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, B_FALSE, tx);
+	return (0);
+}
+
+int
+dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, B_TRUE, tx);
+	return (0);
+}
+
 /*
 * Insert new key in deadlist, which must be > all current entries.
 * mintxg is not inclusive.
@@ -316,6 +414,108 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 	mutex_exit(&dl->dl_lock);
 }

+/*
+ * Remove a deadlist entry and all of its contents by removing the entry from
+ * the deadlist's avl tree, freeing the entry's bpobj and adjusting the
+ * deadlist's space accounting accordingly.
+ */
+void
+dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+	uint64_t used, comp, uncomp;
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	objset_t *os = dl->dl_os;
+
+	if (dl->dl_oldfmt)
+		return;
+
+	mutex_enter(&dl->dl_lock);
+	dsl_deadlist_load_tree(dl);
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+	VERIFY3P(dle, !=, NULL);
+
+	avl_remove(&dl->dl_tree, dle);
+	VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
+	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
+	dl->dl_phys->dl_used -= used;
+	dl->dl_phys->dl_comp -= comp;
+	dl->dl_phys->dl_uncomp -= uncomp;
+	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
+		bpobj_decr_empty(os, tx);
+	} else {
+		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
+	}
+	bpobj_close(&dle->dle_bpobj);
+	kmem_free(dle, sizeof (*dle));
+	mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Clear out the contents of a deadlist_entry by freeing its bpobj,
+ * replacing it with an empty bpobj and adjusting the deadlist's
+ * space accounting
+ */
+void
+dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
+    dmu_tx_t *tx)
+{
+	uint64_t new_obj, used, comp, uncomp;
+	objset_t *os = dl->dl_os;
+
+	mutex_enter(&dl->dl_lock);
+	VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
+	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
+	dl->dl_phys->dl_used -= used;
+	dl->dl_phys->dl_comp -= comp;
+	dl->dl_phys->dl_uncomp -= uncomp;
+	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
+		bpobj_decr_empty(os, tx);
+	else
+		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
+	bpobj_close(&dle->dle_bpobj);
+	new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
+	VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
+	VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
+	    new_obj, tx));
+	ASSERT(bpobj_is_empty(&dle->dle_bpobj));
+	mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Return the first entry in deadlist's avl tree
+ */
+dsl_deadlist_entry_t *
+dsl_deadlist_first(dsl_deadlist_t *dl)
+{
+	dsl_deadlist_entry_t *dle;
+
+	mutex_enter(&dl->dl_lock);
+	dsl_deadlist_load_tree(dl);
+	dle = avl_first(&dl->dl_tree);
+	mutex_exit(&dl->dl_lock);
+
+	return (dle);
+}
+
+/*
+ * Return the last entry in deadlist's avl tree
+ */
+dsl_deadlist_entry_t *
+dsl_deadlist_last(dsl_deadlist_t *dl)
+{
+	dsl_deadlist_entry_t *dle;
+
+	mutex_enter(&dl->dl_lock);
+	dsl_deadlist_load_tree(dl);
+	dle = avl_last(&dl->dl_tree);
+	mutex_exit(&dl->dl_lock);
+
+	return (dle);
+}
+
 /*
 * Walk ds's snapshots to regenerate generate ZAP & AVL.
 */
@@ -478,10 +678,11 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
 }

 static int
-dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
 {
 	dsl_deadlist_t *dl = arg;
-	dsl_deadlist_insert(dl, bp, tx);
+	dsl_deadlist_insert(dl, bp, bp_freed, tx);
 	return (0);
 }

@@ -572,3 +773,109 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
 	}
 	mutex_exit(&dl->dl_lock);
 }
+
+typedef struct livelist_entry {
+	const blkptr_t *le_bp;
+	avl_node_t le_node;
+} livelist_entry_t;
+
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+	const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
+	const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
+
+	/* Sort them according to dva[0] */
+	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
+	uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
+
+	if (l_dva0_vdev != r_dva0_vdev)
+		return (AVL_CMP(l_dva0_vdev, r_dva0_vdev));
+
+	/* if vdevs are equal, sort by offsets. */
+	uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
+	uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
+	if (l_dva0_offset == r_dva0_offset)
+		ASSERT3U(l->blk_birth, ==, r->blk_birth);
+	return (AVL_CMP(l_dva0_offset, r_dva0_offset));
+}
+
+struct livelist_iter_arg {
+	avl_tree_t *avl;
+	bplist_t *to_free;
+	zthr_t *t;
+};
+
+/*
+ * Expects an AVL tree which is incrementally filled will FREE blkptrs
+ * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
+ * corresponding FREE are stored in the supplied bplist.
+ */
+static int
+dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	struct livelist_iter_arg *lia = arg;
+	avl_tree_t *avl = lia->avl;
+	bplist_t *to_free = lia->to_free;
+	zthr_t *t = lia->t;
+	ASSERT(tx == NULL);
+
+	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
+		return (SET_ERROR(EINTR));
+	if (bp_freed) {
+		livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
+		    KM_SLEEP);
+		blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+		*temp_bp = *bp;
+		node->le_bp = temp_bp;
+		avl_add(avl, node);
+	} else {
+		livelist_entry_t node;
+		node.le_bp = bp;
+		livelist_entry_t *found = avl_find(avl, &node, NULL);
+		if (found != NULL) {
+			avl_remove(avl, found);
+			kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
+			kmem_free(found, sizeof (livelist_entry_t));
+		} else {
+			bplist_append(to_free, bp);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs
+ * which have an ALLOC entry but no matching FREE
+ */
+int
+dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
+    uint64_t *size)
+{
+	avl_tree_t avl;
+	avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
+	    offsetof(livelist_entry_t, le_node));
+
+	/* process the sublist */
+	struct livelist_iter_arg arg = {
+	    .avl = &avl,
+	    .to_free = to_free,
+	    .t = t
+	};
+	int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
+
+	avl_destroy(&avl);
+	return (err);
+}
+
+#if defined(_KERNEL)
+/* CSTYLED */
+module_param(zfs_livelist_max_entries, ulong, 0644);
+MODULE_PARM_DESC(zfs_livelist_max_entries,
+	"Size to start the next sub-livelist in a livelist");
+
+module_param(zfs_livelist_min_percent_shared, int, 0644);
+MODULE_PARM_DESC(zfs_livelist_min_percent_shared,
+	"Threshold at which livelist is disabled");
+#endif
@@ -45,6 +45,9 @@
 #include <sys/dmu_impl.h>
 #include <sys/zvol.h>
 #include <sys/zcp.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/zthr.h>
+#include <sys/spa_impl.h>

 int
 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
@@ -120,7 +123,7 @@ struct process_old_arg {
 };

 static int
-process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 {
 	struct process_old_arg *poa = arg;
 	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
@@ -128,7 +131,7 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 	ASSERT(!BP_IS_HOLE(bp));

 	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
-		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
 		if (poa->ds_prev && !poa->after_branch_point &&
 		    bp->blk_birth >
 		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
@@ -852,6 +855,127 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
 	dmu_object_free_zapified(mos, ddobj, tx);
 }

+static void
+dsl_clone_destroy_assert(dsl_dir_t *dd)
+{
+	uint64_t used, comp, uncomp;
+
+	ASSERT(dsl_dir_is_clone(dd));
+	dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
+
+	ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used);
+	ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp);
+	/*
+	 * Greater than because we do not track embedded block pointers in
+	 * the livelist
+	 */
+	ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp);
+
+	ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list));
+	ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list));
+}
+
+/*
+ * Start the delete process for a clone. Free its zil, verify the space usage
+ * and queue the blkptrs for deletion by adding the livelist to the pool-wide
+ * delete queue.
+ */
+static void
+dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	uint64_t zap_obj, to_delete, used, comp, uncomp;
+	objset_t *os;
+	dsl_dir_t *dd = ds->ds_dir;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+
+	/* Check that the clone is in a correct state to be deleted */
+	dsl_clone_destroy_assert(dd);
+
+	/* Destroy the zil */
+	zil_destroy_sync(dmu_objset_zil(os), tx);
+
+	VERIFY0(zap_lookup(mos, dd->dd_object,
+	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete));
+	/* Initialize deleted_clones entry to track livelists to cleanup */
+	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
+	if (error == ENOENT) {
+		zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA,
+		    DMU_OT_NONE, 0, tx);
+		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1,
+		    &(zap_obj), tx));
+		spa->spa_livelists_to_delete = zap_obj;
+	} else if (error != 0) {
+		zfs_panic_recover("zfs: error %d was returned while looking "
+		    "up DMU_POOL_DELETED_CLONES in the zap");
+		return;
+	}
+	VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx));
+
+	/* Clone is no longer using space, now tracked by dp_free_dir */
+	dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
+	dsl_dir_diduse_space(dd, DD_USED_HEAD,
+	    -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes,
+	    tx);
+	dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+	    used, comp, uncomp, tx);
+	dsl_dir_remove_livelist(dd, tx, B_FALSE);
+	zthr_wakeup(spa->spa_livelist_delete_zthr);
+}
+
+/*
+ * Move the bptree into the pool's list of trees to clean up, update space
+ * accounting information and destroy the zil.
+ */
+void
+dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	uint64_t used, comp, uncomp;
+	objset_t *os;
+
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+
+	zil_destroy_sync(dmu_objset_zil(os), tx);
+
+	if (!spa_feature_is_active(dp->dp_spa,
+	    SPA_FEATURE_ASYNC_DESTROY)) {
+		dsl_scan_t *scn = dp->dp_scan;
+		spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
+		    tx);
+		dp->dp_bptree_obj = bptree_alloc(mos, tx);
+		VERIFY0(zap_add(mos,
+		    DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+		    &dp->dp_bptree_obj, tx));
+		ASSERT(!scn->scn_async_destroying);
+		scn->scn_async_destroying = B_TRUE;
+	}
+
+	used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+	comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+	uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
+
+	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+	    dsl_dataset_phys(ds)->ds_unique_bytes == used);
+
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	bptree_add(mos, dp->dp_bptree_obj,
+	    &dsl_dataset_phys(ds)->ds_bp,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg,
+	    used, comp, uncomp, tx);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+	    -used, -comp, -uncomp, tx);
+	dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+	    used, comp, uncomp, tx);
+}
+
 void
 dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
@@ -911,7 +1035,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 	}

 	/*
-	 * Destroy the deadlist.  Unless it's a clone, the
+	 * Destroy the deadlist. Unless it's a clone, the
 	 * deadlist should be empty since the dataset has no snapshots.
 	 * (If it's a clone, it's safe to ignore the deadlist contents
 	 * since they are still referenced by the origin snapshot.)
@@ -924,51 +1048,18 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 	if (dsl_dataset_remap_deadlist_exists(ds))
 		dsl_dataset_destroy_remap_deadlist(ds, tx);

-	objset_t *os;
-	VERIFY0(dmu_objset_from_ds(ds, &os));
-
-	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
-		old_synchronous_dataset_destroy(ds, tx);
+	/*
+	 * Each destroy is responsible for both destroying (enqueuing
+	 * to be destroyed) the blkptrs comprising the dataset as well as
+	 * those belonging to the zil.
+	 */
+	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
+		dsl_async_clone_destroy(ds, tx);
+	} else if (spa_feature_is_enabled(dp->dp_spa,
+	    SPA_FEATURE_ASYNC_DESTROY)) {
+		dsl_async_dataset_destroy(ds, tx);
 	} else {
-		/*
-		 * Move the bptree into the pool's list of trees to
-		 * clean up and update space accounting information.
-		 */
-		uint64_t used, comp, uncomp;
-
-		zil_destroy_sync(dmu_objset_zil(os), tx);
-
-		if (!spa_feature_is_active(dp->dp_spa,
-		    SPA_FEATURE_ASYNC_DESTROY)) {
-			dsl_scan_t *scn = dp->dp_scan;
-			spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
-			    tx);
-			dp->dp_bptree_obj = bptree_alloc(mos, tx);
-			VERIFY0(zap_add(mos,
-			    DMU_POOL_DIRECTORY_OBJECT,
-			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
-			    &dp->dp_bptree_obj, tx));
-			ASSERT(!scn->scn_async_destroying);
-			scn->scn_async_destroying = B_TRUE;
-		}
-
-		used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
-		comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
-		uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
-
-		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-		    dsl_dataset_phys(ds)->ds_unique_bytes == used);
-
-		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-		bptree_add(mos, dp->dp_bptree_obj,
-		    &dsl_dataset_phys(ds)->ds_bp,
-		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
-		    used, comp, uncomp, tx);
-		rrw_exit(&ds->ds_bp_rwlock, FTAG);
-		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
-		    -used, -comp, -uncomp, tx);
-		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
-		    used, comp, uncomp, tx);
+		old_synchronous_dataset_destroy(ds, tx);
 	}

 	if (ds->ds_prev != NULL) {
@@ -20,7 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 Martin Matuska. All rights reserved.
 * Copyright (c) 2014 Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -48,6 +48,7 @@
 #include <sys/policy.h>
 #include <sys/zfs_znode.h>
 #include <sys/zvol.h>
+#include <sys/zthr.h>
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"

@@ -155,6 +156,9 @@ dsl_dir_evict_async(void *dbu)

 	spa_async_close(dd->dd_pool->dp_spa, dd);

+	if (dsl_deadlist_is_open(&dd->dd_livelist))
+		dsl_dir_livelist_close(dd);
+
 	dsl_prop_fini(dd);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
@@ -255,6 +259,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
 			dd->dd_origin_txg =
 			    origin_phys->ds_creation_txg;
 			dmu_buf_rele(origin_bonus, FTAG);
+			if (dsl_dir_is_zapified(dd)) {
+				uint64_t obj;
+				err = zap_lookup(dp->dp_meta_objset,
+				    dd->dd_object, DD_FIELD_LIVELIST,
+				    sizeof (uint64_t), 1, &obj);
+				if (err == 0)
+					dsl_dir_livelist_open(dd, obj);
+				else if (err != ENOENT)
+					goto errout;
+			}
 		}

 		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
@@ -263,6 +277,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
 		if (winner != NULL) {
 			if (dd->dd_parent)
 				dsl_dir_rele(dd->dd_parent, dd);
+			if (dsl_deadlist_is_open(&dd->dd_livelist))
+				dsl_dir_livelist_close(dd);
 			dsl_prop_fini(dd);
 			mutex_destroy(&dd->dd_lock);
 			kmem_free(dd, sizeof (dsl_dir_t));
@@ -291,6 +307,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
 errout:
 	if (dd->dd_parent)
 		dsl_dir_rele(dd->dd_parent, dd);
+	if (dsl_deadlist_is_open(&dd->dd_livelist))
+		dsl_dir_livelist_close(dd);
 	dsl_prop_fini(dd);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
@@ -2178,6 +2196,90 @@ dsl_dir_is_zapified(dsl_dir_t *dd)
 	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
 }

+void
+dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
+{
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
+	    SPA_FEATURE_LIVELIST));
+	dsl_deadlist_open(&dd->dd_livelist, mos, obj);
+	bplist_create(&dd->dd_pending_allocs);
+	bplist_create(&dd->dd_pending_frees);
+}
+
+void
+dsl_dir_livelist_close(dsl_dir_t *dd)
+{
+	dsl_deadlist_close(&dd->dd_livelist);
+	bplist_destroy(&dd->dd_pending_allocs);
+	bplist_destroy(&dd->dd_pending_frees);
+}
+
+void
+dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
+{
+	uint64_t obj;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	spa_t *spa = dp->dp_spa;
+	livelist_condense_entry_t to_condense = spa->spa_to_condense;
+
+	if (!dsl_deadlist_is_open(&dd->dd_livelist))
+		return;
+
+	/*
+	 * If the livelist being removed is set to be condensed, stop the
+	 * condense zthr and indicate the cancellation in the spa_to_condense
+	 * struct in case the condense no-wait synctask has already started
+	 */
+	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+	if (ll_condense_thread != NULL &&
+	    (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
+			/*
+			 * We use zthr_wait_cycle_done instead of zthr_cancel
+			 * because we don't want to destroy the zthr, just have
+			 * it skip its current task.
+			 */
+			spa->spa_to_condense.cancelled = B_TRUE;
+			zthr_wait_cycle_done(ll_condense_thread);
+			/*
+			 * If we've returned from zthr_wait_cycle_done without
+			 * clearing the to_condense data structure it's either
+			 * because the no-wait synctask has started (which is
+			 * indicated by 'syncing' field of to_condense) and we
+			 * can expect it to clear to_condense on its own.
+			 * Otherwise, we returned before the zthr ran. The
+			 * checkfunc will now fail as cancelled == B_TRUE so we
+			 * can safely NULL out ds, allowing a different dir's
+			 * livelist to be condensed.
+			 *
+			 * We can be sure that the to_condense struct will not
+			 * be repopulated at this stage because both this
+			 * function and dsl_livelist_try_condense execute in
+			 * syncing context.
+			 */
+			if ((spa->spa_to_condense.ds != NULL) &&
+			    !spa->spa_to_condense.syncing) {
+				dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
+				    spa);
+				spa->spa_to_condense.ds = NULL;
+			}
+	}
+
+	dsl_dir_livelist_close(dd);
+	int err = zap_lookup(dp->dp_meta_objset, dd->dd_object,
+	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj);
+	if (err == 0) {
+		VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
+		    DD_FIELD_LIVELIST, tx));
+		if (total) {
+			dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
+			spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
+		}
+	} else {
+		ASSERT3U(err, !=, ENOENT);
+	}
+}
+
 #if defined(_KERNEL)
 EXPORT_SYMBOL(dsl_dir_set_quota);
 EXPORT_SYMBOL(dsl_dir_set_reservation);
@@ -721,7 +721,8 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	 * Now that the datasets have been completely synced, we can
 	 * clean up our in-memory structures accumulated while syncing:
 	 *
-	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
+	 *  - move dead blocks from the pending deadlist and livelists
+	 *    to the on-disk versions
 	 *  - release hold from dsl_dataset_dirty()
 	 *  - release key mapping hold from dsl_dataset_dirty()
 	 */
@@ -3103,8 +3103,18 @@ dsl_scan_update_stats(dsl_scan_t *scn)
 }

 static int
-dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
 {
+	ASSERT(!bp_freed);
+	return (dsl_scan_free_block_cb(arg, bp, tx));
+}
+
+static int
+dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	ASSERT(!bp_freed);
 	dsl_scan_t *scn = arg;
 	const dva_t *dva = &bp->blk_dva[0];

@@ -3123,6 +3133,7 @@ dsl_scan_active(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	uint64_t used = 0, comp, uncomp;
+	boolean_t clones_left;

 	if (spa->spa_load_state != SPA_LOAD_NONE)
 		return (B_FALSE);
@@ -3136,7 +3147,8 @@ dsl_scan_active(dsl_scan_t *scn)
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);
 	}
-	return (used != 0);
+	clones_left = spa_livelist_delete_check(spa);
+	return ((used != 0) || (clones_left));
 }

 static boolean_t
@@ -3233,7 +3245,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
 		scn->scn_zio_root = zio_root(spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bpobj_iterate(&dp->dp_free_bpobj,
-		    dsl_scan_free_block_cb, scn, tx);
+		    bpobj_dsl_scan_free_block_cb, scn, tx);
 		VERIFY0(zio_wait(scn->scn_zio_root));
 		scn->scn_zio_root = NULL;

@@ -3330,7 +3342,8 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
 	}

-	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
+	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+	    !spa_livelist_delete_check(spa)) {
 		/* finished; verify that space accounting went to zero */
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
@@ -232,6 +232,27 @@ uint64_t	zfs_max_missing_tvds_scan = 0;
 */
 boolean_t	zfs_pause_spa_sync = B_FALSE;

+/*
+ * Variables to indicate the livelist condense zthr func should wait at certain
+ * points for the livelist to be removed - used to test condense/destroy races
+ */
+int zfs_livelist_condense_zthr_pause = 0;
+int zfs_livelist_condense_sync_pause = 0;
+
+/*
+ * Variables to track whether or not condense cancellation has been
+ * triggered in testing.
+ */
+int zfs_livelist_condense_sync_cancel = 0;
+int zfs_livelist_condense_zthr_cancel = 0;
+
+/*
+ * Variable to track whether or not extra ALLOC blkptrs were added to a
+ * livelist entry while it was being condensed (caused by the way we track
+ * remapped blkptrs in dbuf_remap_impl)
+ */
+int zfs_livelist_condense_new_alloc = 0;
+
 /*
 * ==========================================================================
 * SPA properties routines
@@ -1481,6 +1502,27 @@ spa_unload_log_sm_metadata(spa_t *spa)
 	spa->spa_unflushed_stats.sus_blocklimit = 0;
 }

+static void
+spa_destroy_aux_threads(spa_t *spa)
+{
+	if (spa->spa_condense_zthr != NULL) {
+		zthr_destroy(spa->spa_condense_zthr);
+		spa->spa_condense_zthr = NULL;
+	}
+	if (spa->spa_checkpoint_discard_zthr != NULL) {
+		zthr_destroy(spa->spa_checkpoint_discard_zthr);
+		spa->spa_checkpoint_discard_zthr = NULL;
+	}
+	if (spa->spa_livelist_delete_zthr != NULL) {
+		zthr_destroy(spa->spa_livelist_delete_zthr);
+		spa->spa_livelist_delete_zthr = NULL;
+	}
+	if (spa->spa_livelist_condense_zthr != NULL) {
+		zthr_destroy(spa->spa_livelist_condense_zthr);
+		spa->spa_livelist_condense_zthr = NULL;
+	}
+}
+
 /*
 * Opposite of spa_load().
 */
@@ -1552,15 +1594,7 @@ spa_unload(spa_t *spa)
 		spa->spa_vdev_removal = NULL;
 	}

-	if (spa->spa_condense_zthr != NULL) {
-		zthr_destroy(spa->spa_condense_zthr);
-		spa->spa_condense_zthr = NULL;
-	}
-
-	if (spa->spa_checkpoint_discard_zthr != NULL) {
-		zthr_destroy(spa->spa_checkpoint_discard_zthr);
-		spa->spa_checkpoint_discard_zthr = NULL;
-	}
+	spa_destroy_aux_threads(spa);

 	spa_condense_fini(spa);

@@ -2335,6 +2369,376 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 	return (SET_ERROR(err));
 }

+boolean_t
+spa_livelist_delete_check(spa_t *spa)
+{
+	return (spa->spa_livelists_to_delete != 0);
+}
+
+/* ARGSUSED */
+static boolean_t
+spa_livelist_delete_cb_check(void *arg, zthr_t *z)
+{
+	spa_t *spa = arg;
+	return (spa_livelist_delete_check(spa));
+}
+
+static int
+delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	spa_t *spa = arg;
+	zio_free(spa, tx->tx_txg, bp);
+	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+	    -bp_get_dsize_sync(spa, bp),
+	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+	return (0);
+}
+
+static int
+dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
+{
+	int err;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	zap_cursor_init(&zc, os, zap_obj);
+	err = zap_cursor_retrieve(&zc, &za);
+	zap_cursor_fini(&zc);
+	if (err == 0)
+		*llp = za.za_first_integer;
+	return (err);
+}
+
+/*
+ * Components of livelist deletion that must be performed in syncing
+ * context: freeing block pointers and updating the pool-wide data
+ * structures to indicate how much work is left to do
+ */
+typedef struct sublist_delete_arg {
+	spa_t *spa;
+	dsl_deadlist_t *ll;
+	uint64_t key;
+	bplist_t *to_free;
+} sublist_delete_arg_t;
+
+static void
+sublist_delete_sync(void *arg, dmu_tx_t *tx)
+{
+	sublist_delete_arg_t *sda = arg;
+	spa_t *spa = sda->spa;
+	dsl_deadlist_t *ll = sda->ll;
+	uint64_t key = sda->key;
+	bplist_t *to_free = sda->to_free;
+
+	bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
+	dsl_deadlist_remove_entry(ll, key, tx);
+}
+
+typedef struct livelist_delete_arg {
+	spa_t *spa;
+	uint64_t ll_obj;
+	uint64_t zap_obj;
+} livelist_delete_arg_t;
+
+static void
+livelist_delete_sync(void *arg, dmu_tx_t *tx)
+{
+	livelist_delete_arg_t *lda = arg;
+	spa_t *spa = lda->spa;
+	uint64_t ll_obj = lda->ll_obj;
+	uint64_t zap_obj = lda->zap_obj;
+	objset_t *mos = spa->spa_meta_objset;
+	uint64_t count;
+
+	/* free the livelist and decrement the feature count */
+	VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
+	dsl_deadlist_free(mos, ll_obj, tx);
+	spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
+	VERIFY0(zap_count(mos, zap_obj, &count));
+	if (count == 0) {
+		/* no more livelists to delete */
+		VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_DELETED_CLONES, tx));
+		VERIFY0(zap_destroy(mos, zap_obj, tx));
+		spa->spa_livelists_to_delete = 0;
+	}
+}
+
+/*
+ * Load in the value for the livelist to be removed and open it. Then,
+ * load its first sublist and determine which block pointers should actually
+ * be freed. Then, call a synctask which performs the actual frees and updates
+ * the pool-wide livelist data.
+ */
+/* ARGSUSED */
+void
+spa_livelist_delete_cb(void *arg, zthr_t *z)
+{
+	spa_t *spa = arg;
+	uint64_t ll_obj = 0, count;
+	objset_t *mos = spa->spa_meta_objset;
+	uint64_t zap_obj = spa->spa_livelists_to_delete;
+	/*
+	 * Determine the next livelist to delete. This function should only
+	 * be called if there is at least one deleted clone.
+	 */
+	VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
+	VERIFY0(zap_count(mos, ll_obj, &count));
+	if (count > 0) {
+		dsl_deadlist_t ll = { 0 };
+		dsl_deadlist_entry_t *dle;
+		bplist_t to_free;
+		dsl_deadlist_open(&ll, mos, ll_obj);
+		dle = dsl_deadlist_first(&ll);
+		ASSERT3P(dle, !=, NULL);
+		bplist_create(&to_free);
+		int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
+		    z, NULL);
+		if (err == 0) {
+			sublist_delete_arg_t sync_arg = {
+			    .spa = spa,
+			    .ll = &ll,
+			    .key = dle->dle_mintxg,
+			    .to_free = &to_free
+			};
+			zfs_dbgmsg("deleting sublist (id %llu) from"
+			    " livelist %llu, %d remaining",
+			    dle->dle_bpobj.bpo_object, ll_obj, count - 1);
+			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+			    sublist_delete_sync, &sync_arg, 0,
+			    ZFS_SPACE_CHECK_DESTROY));
+		} else {
+			ASSERT(err == EINTR);
+		}
+		bplist_clear(&to_free);
+		bplist_destroy(&to_free);
+		dsl_deadlist_close(&ll);
+	} else {
+		livelist_delete_arg_t sync_arg = {
+		    .spa = spa,
+		    .ll_obj = ll_obj,
+		    .zap_obj = zap_obj
+		};
+		zfs_dbgmsg("deletion of livelist %llu completed", ll_obj);
+		VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
+		    &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
+	}
+}
+
+void
+spa_start_livelist_destroy_thread(spa_t *spa)
+{
+	ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
+	spa->spa_livelist_delete_zthr = zthr_create(
+	    spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa);
+}
+
+typedef struct livelist_new_arg {
+	bplist_t *allocs;
+	bplist_t *frees;
+} livelist_new_arg_t;
+
+static int
+livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	ASSERT(tx == NULL);
+	livelist_new_arg_t *lna = arg;
+	if (bp_freed) {
+		bplist_append(lna->frees, bp);
+	} else {
+		bplist_append(lna->allocs, bp);
+		zfs_livelist_condense_new_alloc++;
+	}
+	return (0);
+}
+
+typedef struct livelist_condense_arg {
+	spa_t *spa;
+	bplist_t to_keep;
+	uint64_t first_size;
+	uint64_t next_size;
+} livelist_condense_arg_t;
+
+static void
+spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
+{
+	livelist_condense_arg_t *lca = arg;
+	spa_t *spa = lca->spa;
+	bplist_t new_frees;
+	dsl_dataset_t *ds = spa->spa_to_condense.ds;
+
+	/* Have we been cancelled? */
+	if (spa->spa_to_condense.cancelled) {
+		zfs_livelist_condense_sync_cancel++;
+		goto out;
+	}
+
+	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
+	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
+	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
+
+	/*
+	 * It's possible that the livelist was changed while the zthr was
+	 * running. Therefore, we need to check for new blkptrs in the two
+	 * entries being condensed and continue to track them in the livelist.
+	 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
+	 * it's possible that the newly added blkptrs are FREEs or ALLOCs so
+	 * we need to sort them into two different bplists.
+	 */
+	uint64_t first_obj = first->dle_bpobj.bpo_object;
+	uint64_t next_obj = next->dle_bpobj.bpo_object;
+	uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+	uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+
+	bplist_create(&new_frees);
+	livelist_new_arg_t new_bps = {
+	    .allocs = &lca->to_keep,
+	    .frees = &new_frees,
+	};
+
+	if (cur_first_size > lca->first_size) {
+		VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
+		    livelist_track_new_cb, &new_bps, lca->first_size));
+	}
+	if (cur_next_size > lca->next_size) {
+		VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
+		    livelist_track_new_cb, &new_bps, lca->next_size));
+	}
+
+	dsl_deadlist_clear_entry(first, ll, tx);
+	ASSERT(bpobj_is_empty(&first->dle_bpobj));
+	dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
+
+	bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
+	bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
+	bplist_destroy(&new_frees);
+
+	char dsname[ZFS_MAX_DATASET_NAME_LEN];
+	dsl_dataset_name(ds, dsname);
+	zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
+	    "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
+	    "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj,
+	    cur_first_size, next_obj, cur_next_size,
+	    first->dle_bpobj.bpo_object,
+	    first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
+out:
+	dmu_buf_rele(ds->ds_dbuf, spa);
+	spa->spa_to_condense.ds = NULL;
+	bplist_clear(&lca->to_keep);
+	bplist_destroy(&lca->to_keep);
+	kmem_free(lca, sizeof (livelist_condense_arg_t));
+	spa->spa_to_condense.syncing = B_FALSE;
+}
+
+void
+spa_livelist_condense_cb(void *arg, zthr_t *t)
+{
+	while (zfs_livelist_condense_zthr_pause &&
+	    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
+		delay(1);
+
+	spa_t *spa = arg;
+	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
+	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
+	uint64_t first_size, next_size;
+
+	livelist_condense_arg_t *lca =
+	    kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
+	bplist_create(&lca->to_keep);
+
+	/*
+	 * Process the livelists (matching FREEs and ALLOCs) in open context
+	 * so we have minimal work in syncing context to condense.
+	 *
+	 * We save bpobj sizes (first_size and next_size) to use later in
+	 * syncing context to determine if entries were added to these sublists
+	 * while in open context. This is possible because the clone is still
+	 * active and open for normal writes and we want to make sure the new,
+	 * unprocessed blockpointers are inserted into the livelist normally.
+	 *
+	 * Note that dsl_process_sub_livelist() both stores the size number of
+	 * blockpointers and iterates over them while the bpobj's lock held, so
+	 * the sizes returned to us are consistent which what was actually
+	 * processed.
+	 */
+	int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
+	    &first_size);
+	if (err == 0)
+		err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
+		    t, &next_size);
+
+	if (err == 0) {
+		while (zfs_livelist_condense_sync_pause &&
+		    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
+			delay(1);
+
+		dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+		dmu_tx_mark_netfree(tx);
+		dmu_tx_hold_space(tx, 1);
+		err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
+		if (err == 0) {
+			/*
+			 * Prevent the condense zthr restarting before
+			 * the synctask completes.
+			 */
+			spa->spa_to_condense.syncing = B_TRUE;
+			lca->spa = spa;
+			lca->first_size = first_size;
+			lca->next_size = next_size;
+			dsl_sync_task_nowait(spa_get_dsl(spa),
+			    spa_livelist_condense_sync, lca, 0,
+			    ZFS_SPACE_CHECK_NONE, tx);
+			dmu_tx_commit(tx);
+			return;
+		}
+	}
+	/*
+	 * Condensing can not continue: either it was externally stopped or
+	 * we were unable to assign to a tx because the pool has run out of
+	 * space. In the second case, we'll just end up trying to condense
+	 * again in a later txg.
+	 */
+	ASSERT(err != 0);
+	bplist_clear(&lca->to_keep);
+	bplist_destroy(&lca->to_keep);
+	kmem_free(lca, sizeof (livelist_condense_arg_t));
+	dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
+	spa->spa_to_condense.ds = NULL;
+	if (err == EINTR)
+		zfs_livelist_condense_zthr_cancel++;
+}
+
+/* ARGSUSED */
+/*
+ * Check that there is something to condense but that a condense is not
+ * already in progress and that condensing has not been cancelled.
+ */
+static boolean_t
+spa_livelist_condense_cb_check(void *arg, zthr_t *z)
+{
+	spa_t *spa = arg;
+	if ((spa->spa_to_condense.ds != NULL) &&
+	    (spa->spa_to_condense.syncing == B_FALSE) &&
+	    (spa->spa_to_condense.cancelled == B_FALSE)) {
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+void
+spa_start_livelist_condensing_thread(spa_t *spa)
+{
+	spa->spa_to_condense.ds = NULL;
+	spa->spa_to_condense.first = NULL;
+	spa->spa_to_condense.next = NULL;
+	spa->spa_to_condense.syncing = B_FALSE;
+	spa->spa_to_condense.cancelled = B_FALSE;
+
+	ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
+	spa->spa_livelist_condense_zthr = zthr_create(
+	    spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa);
+}
+
 static void
 spa_spawn_aux_threads(spa_t *spa)
 {
@@ -2343,6 +2747,8 @@ spa_spawn_aux_threads(spa_t *spa)
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));

 	spa_start_indirect_condensing_thread(spa);
+	spa_start_livelist_destroy_thread(spa);
+	spa_start_livelist_condensing_thread(spa);

 	ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
 	spa->spa_checkpoint_discard_zthr =
@@ -3603,6 +4009,15 @@ spa_ld_get_props(spa_t *spa)
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

+	/*
+	 * Load the livelist deletion field. If a livelist is queued for
+	 * deletion, indicate that in the spa
+	 */
+	error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
+	    &spa->spa_livelists_to_delete, B_FALSE);
+	if (error != 0 && error != ENOENT)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
 	/*
 	 * Load the history object.  If we have an older pool, this
 	 * will not be present.
@@ -7571,6 +7986,14 @@ spa_async_suspend(spa_t *spa)
 	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
 	if (discard_thread != NULL)
 		zthr_cancel(discard_thread);
+
+	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
+	if (ll_delete_thread != NULL)
+		zthr_cancel(ll_delete_thread);
+
+	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+	if (ll_condense_thread != NULL)
+		zthr_cancel(ll_condense_thread);
 }

 void
@@ -7589,6 +8012,14 @@ spa_async_resume(spa_t *spa)
 	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
 	if (discard_thread != NULL)
 		zthr_resume(discard_thread);
+
+	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
+	if (ll_delete_thread != NULL)
+		zthr_resume(ll_delete_thread);
+
+	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+	if (ll_condense_thread != NULL)
+		zthr_resume(ll_condense_thread);
 }

 static boolean_t
@@ -7639,14 +8070,28 @@ spa_async_request(spa_t *spa, int task)
 * ==========================================================================
 */

+
 static int
-bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
 {
 	bpobj_t *bpo = arg;
-	bpobj_enqueue(bpo, bp, tx);
+	bpobj_enqueue(bpo, bp, bp_freed, tx);
 	return (0);
 }

+int
+bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
+}
+
+int
+bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
+}
+
 static int
 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
@@ -7657,6 +8102,14 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 	return (0);
 }

+static int
+bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	ASSERT(!bp_freed);
+	return (spa_free_sync_cb(arg, bp, tx));
+}
+
 /*
 * Note: this simple function is not inlined to make it easier to dtrace the
 * amount of time spent syncing frees.
@@ -7693,7 +8146,7 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 	 */
 	zio_t *zio = zio_root(spa, NULL, NULL, 0);
 	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
-	    spa_free_sync_cb, zio, tx), ==, 0);
+	    bpobj_spa_free_sync_cb, zio, tx), ==, 0);
 	VERIFY0(zio_wait(zio));
 }

@@ -8296,7 +8749,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
 			 * we sync the deferred frees later in pass 1.
 			 */
 			ASSERT3U(pass, >, 1);
-			bplist_iterate(free_bpl, bpobj_enqueue_cb,
+			bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
 			    &spa->spa_deferred_bpobj, tx);
 		}

@@ -8884,4 +9337,24 @@ MODULE_PARM_DESC(zfs_max_missing_tvds,
 	" (in read-only mode)");
 /* END CSTYLED */

+module_param(zfs_livelist_condense_zthr_pause, int, 0644);
+MODULE_PARM_DESC(zfs_livelist_condense_zthr_pause,
+	"Set the livelist condense zthr to pause");
+module_param(zfs_livelist_condense_sync_pause, int, 0644);
+MODULE_PARM_DESC(zfs_livelist_condense_sync_pause,
+	"Set the livelist condense synctask to pause");
+
+module_param(zfs_livelist_condense_sync_cancel, int, 0644);
+MODULE_PARM_DESC(zfs_livelist_condense_sync_cancel,
+	"Whether livelist condensing was canceled in the synctask");
+module_param(zfs_livelist_condense_zthr_cancel, int, 0644);
+MODULE_PARM_DESC(zfs_livelist_condense_zthr_cancel,
+	"Whether livelist condensing was canceled in the zthr function");
+
+/* BEGIN CSTYLED */
+module_param(zfs_livelist_condense_new_alloc, int, 0644);
+MODULE_PARM_DESC(zfs_livelist_condense_new_alloc,
+	"Whether extra ALLOC blkptrs were added to a livelist entry while it"
+	" was being condensed");
+/* END CSTYLED */
 #endif
@@ -21,7 +21,7 @@

 /*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Joyent, Inc.
 */
@@ -413,7 +413,6 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)

 	/* spa_history_log_sync will free nvl */
 	return (err);
-
 }

 /*
@@ -207,12 +207,15 @@ struct zthr {
 	/* flag set to true if we are canceling the zthr */
 	boolean_t	zthr_cancel;

+	/* flag set to true if we are waiting for the zthr to finish */
+	boolean_t	zthr_haswaiters;
+	kcondvar_t	zthr_wait_cv;
 	/*
 	 * maximum amount of time that the zthr is spent sleeping;
 	 * if this is 0, the thread doesn't wake up until it gets
 	 * signaled.
 	 */
-	hrtime_t	zthr_wait_time;
+	hrtime_t	zthr_sleep_timeout;

 	/* consumer-provided callbacks & data */
 	zthr_checkfunc_t	*zthr_checkfunc;
@@ -239,14 +242,18 @@ zthr_procedure(void *arg)
 			 * order to prevent this process from incorrectly
 			 * contributing to the system load average when idle.
 			 */
-			if (t->zthr_wait_time == 0) {
+			if (t->zthr_sleep_timeout == 0) {
 				cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock);
 			} else {
 				(void) cv_timedwait_sig_hires(&t->zthr_cv,
-				    &t->zthr_state_lock, t->zthr_wait_time,
+				    &t->zthr_state_lock, t->zthr_sleep_timeout,
 				    MSEC2NSEC(1), 0);
 			}
 		}
+		if (t->zthr_haswaiters) {
+			t->zthr_haswaiters = B_FALSE;
+			cv_broadcast(&t->zthr_wait_cv);
+		}
 	}

 	/*
@@ -280,12 +287,13 @@ zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
 	mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL);

 	mutex_enter(&t->zthr_state_lock);
 	t->zthr_checkfunc = checkfunc;
 	t->zthr_func = func;
 	t->zthr_arg = arg;
-	t->zthr_wait_time = max_sleep;
+	t->zthr_sleep_timeout = max_sleep;

 	t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
 	    0, &p0, TS_RUN, minclsyspri);
@@ -303,6 +311,7 @@ zthr_destroy(zthr_t *t)
 	mutex_destroy(&t->zthr_request_lock);
 	mutex_destroy(&t->zthr_state_lock);
 	cv_destroy(&t->zthr_cv);
+	cv_destroy(&t->zthr_wait_cv);
 	kmem_free(t, sizeof (*t));
 }

@@ -355,9 +364,8 @@ zthr_cancel(zthr_t *t)
 	 *
 	 * [1] The thread has already been cancelled, therefore
 	 *     there is nothing for us to do.
-	 * [2] The thread is sleeping, so we broadcast the CV first
-	 *     to wake it up and then we set the flag and we are
-	 *     waiting for it to exit.
+	 * [2] The thread is sleeping so we set the flag, broadcast
+	 *     the CV and wait for it to exit.
 	 * [3] The thread is doing work, in which case we just set
 	 *     the flag and wait for it to finish.
 	 * [4] The thread was just created/resumed, in which case
@@ -397,6 +405,7 @@ zthr_resume(zthr_t *t)
 	ASSERT3P(&t->zthr_checkfunc, !=, NULL);
 	ASSERT3P(&t->zthr_func, !=, NULL);
 	ASSERT(!t->zthr_cancel);
+	ASSERT(!t->zthr_haswaiters);

 	/*
 	 * There are 4 states that we find the zthr in at this point
@@ -451,3 +460,74 @@ zthr_iscancelled(zthr_t *t)
 	mutex_exit(&t->zthr_state_lock);
 	return (cancelled);
 }
+
+/*
+ * Wait for the zthr to finish its current function. Similar to
+ * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end
+ * early. Unlike zthr_cancel, the thread is not destroyed. If the zthr was
+ * sleeping or cancelled, return immediately.
+ */
+void
+zthr_wait_cycle_done(zthr_t *t)
+{
+	mutex_enter(&t->zthr_state_lock);
+
+	/*
+	 * Since we are holding the zthr_state_lock at this point
+	 * we can find the state in one of the following 5 states:
+	 *
+	 * [1] The thread has already cancelled, therefore
+	 *     there is nothing for us to do.
+	 * [2] The thread is sleeping so we set the flag, broadcast
+	 *     the CV and wait for it to exit.
+	 * [3] The thread is doing work, in which case we just set
+	 *     the flag and wait for it to finish.
+	 * [4] The thread was just created/resumed, in which case
+	 *     the behavior is similar to [3].
+	 * [5] The thread is the middle of being cancelled, which is
+	 *     similar to [3]. We'll wait for the cancel, which is
+	 *     waiting for the zthr func.
+	 *
+	 * Since requests are serialized, by the time that we get
+	 * control back we expect that the zthr has completed it's
+	 * zthr_func.
+	 */
+	if (t->zthr_thread != NULL) {
+		t->zthr_haswaiters = B_TRUE;
+
+		/* broadcast in case the zthr is sleeping */
+		cv_broadcast(&t->zthr_cv);
+
+		while ((t->zthr_haswaiters) && (t->zthr_thread != NULL))
+			cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock);
+
+		ASSERT(!t->zthr_haswaiters);
+	}
+
+	mutex_exit(&t->zthr_state_lock);
+}
+
+/*
+ * This function is intended to be used by the zthr itself
+ * to check if another thread is waiting on it to finish
+ *
+ * returns TRUE if we have been asked to finish.
+ *
+ * returns FALSE otherwise.
+ */
+boolean_t
+zthr_has_waiters(zthr_t *t)
+{
+	ASSERT3P(t->zthr_thread, ==, curthread);
+
+	mutex_enter(&t->zthr_state_lock);
+
+	/*
+	 * Similarly to zthr_iscancelled(), we only grab the
+	 * zthr_state_lock so that the zthr itself can use this
+	 * to check for the request.
+	 */
+	boolean_t has_waiters = t->zthr_haswaiters;
+	mutex_exit(&t->zthr_state_lock);
+	return (has_waiters);
+}