ddt: dedup log

Adds a log/journal to dedup. At the end of txg, instead of writing the entry directly to the ZAP, it is added to an in-memory tree and appended to an on-disk object. The on-disk object is only read at import, to reload the in-memory tree. Lookups first go to the log tree before going to the ZAP, so recently-used entries will remain close by in memory. This vastly reduces overhead from dedup IO, as it will not have to do so many read/update/write cycles on ZAP leaf nodes. A flushing facility is added at end of txg, to push logged entries out to the ZAP. There are actually two separate "logs" (in-memory tree and on-disk object), one active (receiving updated entries) and one flushing (writing out to disk). These are swapped (ie flushing begins) based on memory used by the in-memory log trees and time since we last flushed something. The flushing facility monitors the amount of entries coming in and being flushed out, and calibrates itself to try to flush enough each txg to keep up with the ingest rate without competing too much with other IO. Multiple tuneables are provided to control the flushing facility. All the histograms and stats are updated to accommodate the log as a separate entry store. zdb gains knowledge of how to count them and dump them. Documentation included! Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Co-authored-by: Allan Jude <allan@klarasystems.com> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15895
2026-05-22 02:27:36 +03:00 · 2023-06-22 17:46:22 +10:00
parent cbb9ef0a4c
commit cd69ba3d49
17 changed files with 1621 additions and 131 deletions
@@ -125,6 +125,28 @@
 * without which, no space would be recovered and the DDT would continue to be
 * considered "over quota". See zap_shrink_enabled.
 *
+ * ## Dedup log
+ *
+ * Historically, all entries modified on a txg were written back to dedup
+ * storage objects at the end of every txg. This could cause significant
+ * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
+ * and so required reading the whole node, updating the entry, and writing it
+ * back. On busy pools, this could add serious IO and memory overheads.
+ *
+ * To address this, the dedup log was added. If the "fast_dedup" feature is
+ * enabled, at the end of each txg, modified entries will be copied to an
+ * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
+ * same block is requested again, the in-memory object will be checked first,
+ * and if it's there, the entry is inflated back onto the live tree without
+ * going to storage. The on-disk log is only read at pool import time, to
+ * reload the in-memory log.
+ *
+ * Each txg, some amount of the in-memory log will be flushed out to a DDT
+ * storage object (ie ZAP) as normal. OpenZFS will try hard to flush enough to
+ * keep up with the rate of change on dedup entries, but not so much that it
+ * would impact overall throughput, and not using too much memory. See the
+ * zfs_dedup_log_* tuneables in zfs(4) for more details.
+ *
 * ## Repair IO
 *
 * If a read on a dedup block fails, but there are other copies of the block in
@@ -201,6 +223,26 @@ int zfs_dedup_prefetch = 0;
 uint_t dedup_class_wait_txgs = 5;


+/*
+ * Don't do more than this many incremental flush passes per txg. Values
+ * below 1 are treated as 1 by the flusher.
+ */
+uint_t zfs_dedup_log_flush_passes_max = 8;
+
+/*
+ * Minimum time to flush per txg, in milliseconds. When txg_sync_waiting()
+ * reports true, this (capped at zfs_txg_timeout) is the time budget for
+ * flushing in that txg; otherwise the budget is zfs_txg_timeout.
+ */
+uint_t zfs_dedup_log_flush_min_time_ms = 1000;
+
+/*
+ * Minimum entries to flush per txg. The per-txg flush target is the larger
+ * of this and the current ingest rate.
+ */
+uint_t zfs_dedup_log_flush_entries_min = 1000;
+
+/*
+ * Number of txgs to average flow rates across. This is the weight given to
+ * the moving averages of the ingest, flush and flush time rates; zero is
+ * treated as 1.
+ */
+uint_t zfs_dedup_log_flush_flow_rate_txgs = 10;
+
 static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
 	&ddt_zap_ops,
 };
@@ -217,7 +259,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = {
 */
 static const uint64_t ddt_version_flags[] = {
 	[DDT_VERSION_LEGACY] = 0,
-	[DDT_VERSION_FDT] = DDT_FLAG_FLAT,
+	[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
 };

 /* Dummy version to signal that configure is still necessary */
@@ -405,13 +447,13 @@ ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)

 static int
 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
-    ddt_entry_t *dde, dmu_tx_t *tx)
+    const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));

 	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
-	    ddt->ddt_object[type][class], &dde->dde_key,
-	    dde->dde_phys, DDT_PHYS_SIZE(ddt), tx));
+	    ddt->ddt_object[type][class], &ddlwe->ddlwe_key,
+	    &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx));
 }

 static int
@@ -701,16 +743,15 @@ ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
 }

 uint64_t
-ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
+ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp)
 {
 	uint64_t refcnt = 0;

-	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
-		refcnt = dde->dde_phys->ddp_flat.ddp_refcnt;
-	} else {
-		for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
-			refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt;
-	}
+	if (ddt->ddt_flags & DDT_FLAG_FLAT)
+		refcnt = ddp->ddp_flat.ddp_refcnt;
+	else
+		for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++)
+			refcnt += ddp->ddp_trad[v].ddp_refcnt;

 	return (refcnt);
 }
@@ -743,11 +784,15 @@ ddt_init(void)
 	    DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
 	ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
 	    DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	ddt_log_init();
 }

 void
 ddt_fini(void)
 {
+	ddt_log_fini();
+
 	kmem_cache_destroy(ddt_entry_trad_cache);
 	kmem_cache_destroy(ddt_entry_flat_cache);
 	kmem_cache_destroy(ddt_cache);
@@ -805,6 +850,13 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 {
 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));

+	/* Entry is still in the log, so charge the entry back to it */
+	if (dde->dde_flags & DDE_FLAG_LOGGED) {
+		ddt_lightweight_entry_t ddlwe;
+		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
+	}
+
 	avl_remove(&ddt->ddt_tree, dde);
 	ddt_free(ddt, dde);
 }
@@ -951,6 +1003,25 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)

 	avl_insert(&ddt->ddt_tree, dde, where);

+	/* If its in the log tree, we can "load" it from there */
+	if (ddt->ddt_flags & DDT_FLAG_LOG) {
+		ddt_lightweight_entry_t ddlwe;
+
+		if (ddt_log_take_key(ddt, ddt->ddt_log_active,
+		    &search, &ddlwe) ||
+		    ddt_log_take_key(ddt, ddt->ddt_log_flushing,
+		    &search, &ddlwe)) {
+			dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
+
+			dde->dde_type = ddlwe.ddlwe_type;
+			dde->dde_class = ddlwe.ddlwe_class;
+			memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
+			    DDT_PHYS_SIZE(ddt));
+
+			return (dde);
+		}
+	}
+
 	/*
 	 * ddt_tree is now stable, so unlock and let everyone else keep moving.
 	 * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
@@ -993,10 +1064,14 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		dde->dde_flags |= DDE_FLAG_OVERQUOTA;
 	} else if (error == 0) {
 		/*
-		 * The histograms only track inactive (stored) blocks.
+		 * The histograms only track inactive (stored or logged) blocks.
 		 * We've just put an entry onto the live list, so we need to
 		 * remove its counts. When its synced back, it'll be re-added
 		 * to the right one.
+		 *
+		 * We only do this when we successfully found it in the store.
+		 * error == ENOENT means this is a new entry, and so its already
+		 * not counted.
 		 */
 		ddt_histogram_t *ddh =
 		    &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
@@ -1099,6 +1174,8 @@ ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
 		}
 	}

+	ddt_log_destroy(ddt, tx);
+
 	uint64_t count;
 	ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
 	ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
@@ -1241,23 +1318,26 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)

 	ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
 	memset(ddt, 0, sizeof (ddt_t));
-
 	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&ddt->ddt_tree, ddt_key_compare,
 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 	avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+
 	ddt->ddt_checksum = c;
 	ddt->ddt_spa = spa;
 	ddt->ddt_os = spa->spa_meta_objset;
 	ddt->ddt_version = DDT_VERSION_UNCONFIGURED;

+	ddt_log_alloc(ddt);
+
 	return (ddt);
 }

 static void
 ddt_table_free(ddt_t *ddt)
 {
+	ddt_log_free(ddt);
 	ASSERT0(avl_numnodes(&ddt->ddt_tree));
 	ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
 	avl_destroy(&ddt->ddt_tree);
@@ -1310,6 +1390,10 @@ ddt_load(spa_t *spa)
 			}
 		}

+		error = ddt_log_load(ddt);
+		if (error != 0 && error != ENOENT)
+			return (error);
+
 		/*
 		 * Seed the cached histograms.
 		 */
@@ -1483,109 +1567,15 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio)
 }

 static void
-ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx)
 {
-	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
-	ddt_key_t *ddk = &dde->dde_key;
-	ddt_type_t otype = dde->dde_type;
-	ddt_type_t ntype = DDT_TYPE_DEFAULT;
-	ddt_class_t oclass = dde->dde_class;
-	ddt_class_t nclass;
-	uint64_t total_refcnt = 0;
-
-	ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
-
-	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		ASSERT(dde->dde_io == NULL ||
-		    dde->dde_io->dde_lead_zio[p] == NULL);
-		ddt_univ_phys_t *ddp = dde->dde_phys;
-		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
-		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
-
-		if (ddt_phys_birth(ddp, v) == 0) {
-			ASSERT0(phys_refcnt);
-			continue;
-		}
-		if (DDT_PHYS_IS_DITTO(ddt, p)) {
-			/*
-			 * Note, we no longer create DDT-DITTO blocks, but we
-			 * don't want to leak any written by older software.
-			 */
-			ddt_phys_free(ddt, ddk, ddp, v, txg);
-			continue;
-		}
-		if (phys_refcnt == 0)
-			ddt_phys_free(ddt, ddk, ddp, v, txg);
-		total_refcnt += phys_refcnt;
-	}
-
-	if (total_refcnt > 1)
-		nclass = DDT_CLASS_DUPLICATE;
-	else
-		nclass = DDT_CLASS_UNIQUE;
-
-	if (otype != DDT_TYPES &&
-	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
-		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
-		ASSERT3U(
-		    ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT);
-	}
-
-	if (total_refcnt != 0) {
-		dde->dde_type = ntype;
-		dde->dde_class = nclass;
-
-		if (!ddt_object_exists(ddt, ntype, nclass))
-			ddt_object_create(ddt, ntype, nclass, tx);
-		VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx));
-
-		ddt_lightweight_entry_t ddlwe;
-		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
-
-		ddt_histogram_t *ddh =
-		    &ddt->ddt_histogram[ntype][nclass];
-		ddt_histogram_add_entry(ddt, ddh, &ddlwe);
-
-		/*
-		 * If the class changes, the order that we scan this bp
-		 * changes.  If it decreases, we could miss it, so
-		 * scan it right now.  (This covers both class changing
-		 * while we are doing ddt_walk(), and when we are
-		 * traversing.)
-		 */
-		if (nclass < oclass) {
-			dsl_scan_ddt_entry(dp->dp_scan,
-			    ddt->ddt_checksum, ddt, &ddlwe, tx);
-		}
-	}
-}
-
-static void
-ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
-{
-	spa_t *spa = ddt->ddt_spa;
-	ddt_entry_t *dde;
-	void *cookie = NULL;
-
-	if (avl_numnodes(&ddt->ddt_tree) == 0)
-		return;
-
-	ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP);
-
-	if (spa->spa_ddt_stat_object == 0) {
-		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
-		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_DDT_STATS, tx);
-	}
-
-	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
-		ddt_create_dir(ddt, tx);
-
-	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
-		ddt_sync_entry(ddt, dde, tx, txg);
-		ddt_free(ddt, dde);
-	}
-
+	/*
+	 * Count all the entries stored for each type/class, and updates the
+	 * stats within (ddt_object_sync()). If there's no entries for the
+	 * type/class, the whole object is removed. If all objects for the DDT
+	 * are removed, its containing dir is removed, effectively resetting
+	 * the entire DDT to an empty slate.
+	 */
 	uint64_t count = 0;
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		uint64_t add, tcount = 0;
@@ -1604,6 +1594,12 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 		count += tcount;
 	}

+	if (ddt->ddt_flags & DDT_FLAG_LOG) {
+		/* Include logged entries in the total count */
+		count += avl_numnodes(&ddt->ddt_log_active->ddl_tree);
+		count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
+	}
+
 	if (count == 0) {
 		/*
 		 * No entries left on the DDT, so reset the version for next
@@ -1620,8 +1616,398 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)

 	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
 	    sizeof (ddt->ddt_histogram));
-	spa->spa_dedup_dspace = ~0ULL;
-	spa->spa_dedup_dsize = ~0ULL;
+	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
+	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
+}
+
+/*
+ * Give the scan code a look at an entry that is leaving the live tree, if
+ * the change being synced could otherwise cause an in-progress traversal to
+ * miss it. dsl_scan_ddt_entry() is called when the entry's total refcnt is
+ * zero, or when the class computed from that refcnt is lower than the class
+ * recorded in the lightweight entry.
+ */
+static void
+ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+
+	/*
+	 * Compute the target class, so we can decide whether or not to inform
+	 * the scrub traversal (below). Note that we don't store this in the
+	 * entry, as it might change multiple times before finally being
+	 * committed (if we're logging). Instead, we recompute it in
+	 * ddt_sync_flush_entry().
+	 */
+	uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys);
+	ddt_class_t nclass =
+	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
+
+	/*
+	 * If the class changes, the order that we scan this bp changes. If it
+	 * decreases, we could miss it, so scan it right now. (This covers both
+	 * class changing while we are doing ddt_walk(), and when we are
+	 * traversing.)
+	 *
+	 * We also do this when the refcnt goes to zero, because that change is
+	 * only in the log so far; the blocks on disk won't be freed until
+	 * the log is flushed, and the refcnt might increase before that. If it
+	 * does, then we could miss it in the same way.
+	 */
+	if (refcnt == 0 || nclass < ddlwe->ddlwe_class)
+		dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt,
+		    ddlwe, tx);
+}
+
+/*
+ * Write a single entry out to its DDT store object.
+ *
+ * ddlwe is the entry to write; otype/oclass are the type and class it was
+ * last known under. The new class is chosen from the entry's total refcnt
+ * (more than one reference is DUPLICATE, otherwise UNIQUE) and the new type
+ * is always DDT_TYPE_DEFAULT.
+ *
+ * Along the way, any phys with a birth time but no remaining references
+ * (and any obsolete ditto phys) is freed in tx's txg. If the entry ends up
+ * with no references at all it is only removed from the old object, and
+ * nothing is written back.
+ */
+static void
+ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
+    ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx)
+{
+	ddt_key_t *ddk = &ddlwe->ddlwe_key;
+	ddt_type_t ntype = DDT_TYPE_DEFAULT;
+	uint64_t refcnt = 0;
+
+	/*
+	 * Compute the total refcnt. Along the way, issue frees for any DVAs
+	 * we no longer want.
+	 */
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
+
+		if (ddt_phys_birth(ddp, v) == 0) {
+			ASSERT3U(phys_refcnt, ==, 0);
+			continue;
+		}
+		if (DDT_PHYS_IS_DITTO(ddt, p)) {
+			/*
+			 * We don't want to keep any obsolete slots (eg ditto),
+			 * regardless of their refcount, but we don't want to
+			 * leak them either. So, free them.
+			 */
+			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
+			continue;
+		}
+		if (phys_refcnt == 0)
+			/* No remaining references, free it! */
+			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
+		refcnt += phys_refcnt;
+	}
+
+	/* Select the best class for the entry. */
+	ddt_class_t nclass =
+	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
+
+	/*
+	 * If an existing entry changed type or class, or its refcount reached
+	 * zero, delete it from the DDT object
+	 *
+	 * NOTE(review): otype == DDT_TYPES presumably marks an entry that was
+	 * never written to a store object; confirm against the callers.
+	 */
+	if (otype != DDT_TYPES &&
+	    (otype != ntype || oclass != nclass || refcnt == 0)) {
+		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
+		ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT);
+	}
+
+	/*
+	 * Add or update the entry
+	 */
+	if (refcnt != 0) {
+		ddt_histogram_t *ddh =
+		    &ddt->ddt_histogram[ntype][nclass];
+
+		/* The entry is about to be stored, so count it as stored. */
+		ddt_histogram_add_entry(ddt, ddh, ddlwe);
+
+		/* The target object is created on first use. */
+		if (!ddt_object_exists(ddt, ntype, nclass))
+			ddt_object_create(ddt, ntype, nclass, tx);
+		VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx));
+	}
+}
+
+/*
+ * Calculate an exponential weighted moving average, lower limited to zero.
+ *
+ * val is the newest sample and prev is the previous average. The average
+ * moves towards the sample by 1/weight of the difference between them. A
+ * weight of zero is treated as 1, in which case the result is just val.
+ */
+static inline int32_t
+_ewma(int32_t val, int32_t prev, uint32_t weight)
+{
+	/*
+	 * These are signed quantities, so they need the signed comparison;
+	 * an unsigned ">= 0" check can never fail.
+	 */
+	ASSERT3S(val, >=, 0);
+	ASSERT3S(prev, >=, 0);
+	const int32_t new =
+	    MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1));
+	ASSERT3S(new, >=, 0);
+	return (new);
+}
+
+/*
+ * Do one incremental flush pass: move some entries from the flushing log to
+ * the DDT store objects, sized so the txg's flush target is spread over the
+ * allowed sync passes and time budget.
+ *
+ * Returns B_TRUE when flushing has just finished for this txg (nothing left,
+ * out of passes or out of time), which tells the caller to do its
+ * end-of-txg housekeeping. Returns B_FALSE when more passes are wanted, and
+ * also when the call was a no-op because this txg was already finished or
+ * this sync pass was already flushed.
+ *
+ * All times in here are in nanoseconds, and are kept in 64 bits; the time
+ * budget alone can exceed what fits in 32 bits.
+ */
+static boolean_t
+ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
+{
+	if (ddt->ddt_flush_pass == 0) {
+		if (spa_sync_pass(ddt->ddt_spa) == 1) {
+			/* First run this txg, get set up */
+			ddt->ddt_flush_start = gethrtime();
+			ddt->ddt_flush_count = 0;
+
+			/*
+			 * How many entries we need to flush. We want to at
+			 * least match the ingest rate.
+			 */
+			ddt->ddt_flush_min = MAX(
+			    ddt->ddt_log_ingest_rate,
+			    zfs_dedup_log_flush_entries_min);
+		} else {
+			/* We already decided we're done for this txg */
+			return (B_FALSE);
+		}
+	} else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) {
+		/*
+		 * We already did some flushing on this pass, skip it. This
+		 * happens when dsl_process_async_destroys() runs during a scan
+		 * (on pass 1) and does an additional ddt_sync() to update
+		 * freed blocks.
+		 */
+		return (B_FALSE);
+	}
+
+	if (spa_sync_pass(ddt->ddt_spa) >
+	    MAX(zfs_dedup_log_flush_passes_max, 1)) {
+		/* Too many passes this txg, defer until next. */
+		ddt->ddt_flush_pass = 0;
+		return (B_TRUE);
+	}
+
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
+		/* Nothing to flush, done for this txg. */
+		ddt->ddt_flush_pass = 0;
+		return (B_TRUE);
+	}
+
+	uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ?
+	    MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
+	    SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout);
+
+	uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start;
+
+	if (elapsed_time >= target_time) {
+		/* Too long since we started, done for this txg. */
+		ddt->ddt_flush_pass = 0;
+		return (B_TRUE);
+	}
+
+	ddt->ddt_flush_pass++;
+	ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass);
+
+	/*
+	 * Estimate how much time we'll need to flush the remaining entries
+	 * based on how long it normally takes.
+	 */
+	uint64_t want_time;
+	if (ddt->ddt_flush_pass == 1) {
+		/* First pass, use the average time/entries */
+		if (ddt->ddt_log_flush_rate == 0)
+			/* Zero rate, just assume the whole time */
+			want_time = target_time;
+		else
+			/*
+			 * The flush time rate is tracked in milliseconds,
+			 * so convert to match the rest of the arithmetic.
+			 */
+			want_time = MSEC2NSEC((uint64_t)ddt->ddt_flush_min *
+			    (uint64_t)ddt->ddt_log_flush_time_rate) /
+			    (uint64_t)ddt->ddt_log_flush_rate;
+	} else {
+		/* Later pass, calculate from this txg so far */
+		want_time = (uint64_t)ddt->ddt_flush_min *
+		    elapsed_time / (uint64_t)ddt->ddt_flush_count;
+	}
+
+	/* Figure out how much time we have left */
+	uint64_t remain_time = target_time - elapsed_time;
+
+	/* Smear the remaining entries over the remaining passes. */
+	uint32_t nentries = ddt->ddt_flush_min /
+	    (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass);
+	if (want_time > remain_time) {
+		/*
+		 * We're behind; try to catch up a bit by doubling the amount
+		 * this pass. If we're behind that means we're in a later
+		 * pass and likely have most of the remaining time to
+		 * ourselves. If we're in the last couple of passes, then
+		 * doubling might just take us over the timeout, but probably
+		 * not be much, and it stops us falling behind. If we're
+		 * in the middle passes, there'll be more to do, but it
+		 * might just help us catch up a bit and we'll recalculate on
+		 * the next pass anyway.
+		 */
+		nentries = MIN(ddt->ddt_flush_min, nentries*2);
+	}
+
+	ddt_lightweight_entry_t ddlwe;
+	uint32_t count = 0;
+	while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
+		ddt_sync_flush_entry(ddt, &ddlwe,
+		    ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
+
+		/* End this pass if we've synced as much as we need to. */
+		if (++count >= nentries)
+			break;
+	}
+	ddt->ddt_flush_count += count;
+	/*
+	 * NOTE(review): at least one entry is always taken, even when
+	 * nentries is zero, so count can exceed what is left of
+	 * ddt_flush_min here. Confirm the field copes with going below zero.
+	 */
+	ddt->ddt_flush_min -= count;
+
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
+		/* We emptied it, so truncate on-disk */
+		ddt_log_truncate(ddt, tx);
+		/* No more passes needed this txg */
+		ddt->ddt_flush_pass = 0;
+	} else
+		/* More to do next time, save checkpoint */
+		ddt_log_checkpoint(ddt, &ddlwe, tx);
+
+	ddt_sync_update_stats(ddt, tx);
+
+	return (ddt->ddt_flush_pass == 0);
+}
+
+/*
+ * Flush some of the flushing log out to the DDT store objects and, once
+ * flushing has finished for this txg, do the end-of-txg housekeeping: swap
+ * the logs if the flushing log is drained and the active log has entries,
+ * then update the moving averages of entries flushed and time spent
+ * flushing.
+ *
+ * The live tree must already be empty when this is called.
+ */
+static void
+ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ASSERT(avl_is_empty(&ddt->ddt_tree));
+
+	/* Don't do any flushing when the pool is ready to shut down */
+	if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa))
+		return;
+
+	/* Try to flush some. */
+	if (!ddt_sync_flush_log_incremental(ddt, tx))
+		/* More to do next time */
+		return;
+
+	/* No more flushing this txg, so we can do end-of-txg housekeeping */
+
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
+	    !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
+		/*
+		 * No more to flush, and the active list has stuff, so
+		 * try to swap the logs for next time.
+		 */
+		(void) ddt_log_swap(ddt, tx);
+	}
+
+	/*
+	 * Update flush rate. This is an exponential weighted moving average of
+	 * the number of entries flushed over recent txgs.
+	 */
+	ddt->ddt_log_flush_rate = _ewma(
+	    ddt->ddt_flush_count, ddt->ddt_log_flush_rate,
+	    zfs_dedup_log_flush_flow_rate_txgs);
+
+	/*
+	 * Update flush time rate. This is an exponential weighted moving
+	 * average of the total time taken to flush over recent txgs, in
+	 * milliseconds. As with the flush rate, the new sample goes first
+	 * and the previous average second.
+	 */
+	ddt->ddt_log_flush_time_rate = _ewma(
+	    ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))),
+	    ddt->ddt_log_flush_time_rate,
+	    zfs_dedup_log_flush_flow_rate_txgs);
+}
+
+/*
+ * Logging sync path: move every entry on the live tree into the dedup log
+ * rather than writing it to a DDT store object, then (on the first sync
+ * pass only) fold the number of entries moved into the ingest rate.
+ */
+static void
+ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
+{
+	uint64_t nentries = avl_numnodes(&ddt->ddt_tree);
+
+	if (nentries != 0) {
+		ddt_log_update_t dlu = {0};
+		ddt_lightweight_entry_t ddlwe;
+		ddt_entry_t *dde;
+		void *cookie = NULL;
+
+		ddt_log_begin(ddt, nentries, tx, &dlu);
+
+		/* Empty the live tree, logging each entry as it goes. */
+		for (;;) {
+			dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie);
+			if (dde == NULL)
+				break;
+
+			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
+			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+			ddt_log_entry(ddt, &ddlwe, &dlu);
+			ddt_sync_scan_entry(ddt, &ddlwe, tx);
+			ddt_free(ddt, dde);
+		}
+
+		ddt_log_commit(ddt, &dlu);
+
+		/*
+		 * The store objects were not touched, but entries that now
+		 * live in the log are no longer theirs to count. Sync their
+		 * stats so the on-disk counts agree, otherwise they would be
+		 * miscounted at the next import.
+		 */
+		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+			for (ddt_class_t class = 0;
+			    class < DDT_CLASSES; class++) {
+				if (!ddt_object_exists(ddt, type, class))
+					continue;
+				ddt_object_sync(ddt, type, class, tx);
+			}
+		}
+
+		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
+		    sizeof (ddt->ddt_histogram));
+		ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
+		ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
+	}
+
+	/* The ingest rate is only sampled on the first sync pass. */
+	if (spa_sync_pass(ddt->ddt_spa) != 1)
+		return;
+
+	/*
+	 * Ingest rate: an exponential weighted moving average of how many
+	 * entries changed over recent txgs. Slow ramp-up is not a concern,
+	 * since the flusher aims for at least the minimum regardless.
+	 */
+	ddt->ddt_log_ingest_rate = _ewma(
+	    nentries, ddt->ddt_log_ingest_rate,
+	    zfs_dedup_log_flush_flow_rate_txgs);
+}
+
+/*
+ * Non-logging sync path: empty the live tree, writing each entry straight
+ * to its DDT store object, then refresh the cached histograms and the
+ * stored stats.
+ */
+static void
+ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_lightweight_entry_t ddlwe;
+	ddt_entry_t *dde;
+	void *cookie = NULL;
+
+	/* Nothing on the live tree, so nothing to write out. */
+	if (avl_numnodes(&ddt->ddt_tree) == 0)
+		return;
+
+	for (;;) {
+		dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie);
+		if (dde == NULL)
+			break;
+
+		ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
+
+		/* Write the entry out, then let the scan code see it. */
+		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+		ddt_sync_flush_entry(ddt, &ddlwe,
+		    dde->dde_type, dde->dde_class, tx);
+		ddt_sync_scan_entry(ddt, &ddlwe, tx);
+
+		ddt_free(ddt, dde);
+	}
+
+	/* Reset the cached space figures and histogram copy. */
+	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
+	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
+	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
+	    sizeof (ddt->ddt_histogram));
+
+	ddt_sync_update_stats(ddt, tx);
+}
+
+/*
+ * Sync one DDT for this txg: make sure the on-disk stats object (and, for
+ * FDT tables, the containing directory) exist, then hand the live tree to
+ * either the log path or the direct flush path depending on whether the
+ * table has DDT_FLAG_LOG set.
+ */
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx)
+{
+	spa_t *spa = ddt->ddt_spa;
+
+	/* A version of UINT64_MAX means there is nothing to sync yet. */
+	if (ddt->ddt_version == UINT64_MAX)
+		return;
+
+	/* Pools older than the dedup version can't have live entries. */
+	if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) {
+		ASSERT0(avl_numnodes(&ddt->ddt_tree));
+		return;
+	}
+
+	/* Create the pool-wide DDT stats object on first use. */
+	if (spa->spa_ddt_stat_object == 0) {
+		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
+		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_DDT_STATS, tx);
+	}
+
+	/* FDT tables keep their objects in a directory; create it if unset. */
+	if (ddt->ddt_dir_object == 0 && ddt->ddt_version == DDT_VERSION_FDT)
+		ddt_create_dir(ddt, tx);
+
+	if (!(ddt->ddt_flags & DDT_FLAG_LOG)) {
+		ddt_sync_table_flush(ddt, tx);
+		return;
+	}
+
+	ddt_sync_table_log(ddt, tx);
+}

 void
@@ -1651,7 +2037,9 @@ ddt_sync(spa_t *spa, uint64_t txg)
 		ddt_t *ddt = spa->spa_ddt[c];
 		if (ddt == NULL)
 			continue;
-		ddt_sync_table(ddt, tx, txg);
+		ddt_sync_table(ddt, tx);
+		if (ddt->ddt_flags & DDT_FLAG_LOG)
+			ddt_sync_flush_log(ddt, tx);
 		ddt_repair_table(ddt, rio);
 	}

@@ -1719,9 +2107,12 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 		return (B_FALSE);
 	}

-	if (dde->dde_type < DDT_TYPES) {
-		ASSERT3S(dde->dde_class, <, DDT_CLASSES);
-
+	if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) {
+		/*
+		 * This entry was either synced to a store object (dde_type is
+		 * real) or was logged. It must be properly on disk at this
+		 * point, so we can just bump its refcount.
+		 */
 		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

@@ -1748,7 +2139,6 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 		 * we may have a block with the DEDUP set, but which doesn't
 		 * have a corresponding entry in the DDT. Be ready.
 		 */
-		ASSERT3S(dde->dde_class, ==, DDT_CLASSES);
 		ddt_remove(ddt, dde);
 		result = B_FALSE;
 	}
@@ -1761,3 +2151,15 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)

 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
 	"Enable prefetching dedup-ed blks");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW,
+	"Max number of incremental dedup log flush passes per transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
+	"Min time to spend on incremental dedup log flush each transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
+	"Min number of log entries to flush each transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
+	"Number of txgs to average flow rates across");
@@ -0,0 +1,760 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2023, Klara Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/ddt.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu.h>
+#include <sys/ddt_impl.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * No more than this many txgs before swapping logs.
+ */
+uint_t zfs_dedup_log_txg_max = 8;
+
+/*
+ * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
+ * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
+ */
+uint64_t zfs_dedup_log_mem_max = 0;
+uint_t zfs_dedup_log_mem_max_percent = 1;
+
+
+/*
+ * Allocation caches for in-memory log tree entries, one per phys layout
+ * (flat and traditional). Created in ddt_log_init(), destroyed in
+ * ddt_log_fini().
+ */
+static kmem_cache_t *ddt_log_entry_flat_cache;
+static kmem_cache_t *ddt_log_entry_trad_cache;
+
+/*
+ * Size of one in-memory log entry for each layout: the fixed
+ * ddt_log_entry_t followed by that layout's phys data.
+ */
+#define	DDT_LOG_ENTRY_FLAT_SIZE	\
+	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
+#define	DDT_LOG_ENTRY_TRAD_SIZE	\
+	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)
+
+/* Entry size for the layout used by this ddt. */
+#define	DDT_LOG_ENTRY_SIZE(ddt)	\
+	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
+
+/*
+ * One-time setup for the dedup log: settle the memory limit for the log
+ * trees and create the entry caches.
+ */
+void
+ddt_log_init(void)
+{
+	/*
+	 * Memory cap for log AVL entries. An explicitly configured
+	 * zfs_dedup_log_mem_max is used as given; otherwise it is derived
+	 * from zfs_dedup_log_mem_max_percent (clamped to 100) of physical
+	 * memory. Asking for 100% is allowed; it just means the txg flush
+	 * limit is the only thing bounding the log.
+	 */
+	if (zfs_dedup_log_mem_max == 0) {
+		if (zfs_dedup_log_mem_max_percent > 100)
+			zfs_dedup_log_mem_max_percent = 100;
+		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
+		    zfs_dedup_log_mem_max_percent / 100;
+	}
+
+	/*
+	 * Never less than 1M, because we need something (that's ~3800
+	 * entries per tree).
+	 */
+	if (zfs_dedup_log_mem_max < 1*1024*1024)
+		zfs_dedup_log_mem_max = 1*1024*1024;
+
+	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
+	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
+	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+/* Tear down the entry caches created by ddt_log_init(). */
+void
+ddt_log_fini(void)
+{
+	kmem_cache_destroy(ddt_log_entry_flat_cache);
+	kmem_cache_destroy(ddt_log_entry_trad_cache);
+}
+
+/*
+ * Build the name under which log object n is stored in the DDT directory,
+ * from this table's checksum name and n. The caller supplies a buffer of
+ * at least DDT_NAMELEN bytes.
+ */
+static void
+ddt_log_name(ddt_t *ddt, char *name, uint_t n)
+{
+	const char *cksum_name =
+	    zio_checksum_table[ddt->ddt_checksum].ci_name;
+
+	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG, cksum_name, n);
+}
+
+/*
+ * Copy the in-memory state of a log (flags, length, first txg, checkpoint)
+ * into the header stored in the log object's bonus buffer, dirtying it in
+ * the given tx.
+ */
+static void
+ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
+{
+	dmu_buf_t *bonus;
+	ddt_log_header_t *dlh;
+
+	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &bonus));
+	dmu_buf_will_dirty(bonus, tx);
+
+	dlh = (ddt_log_header_t *)bonus->db_data;
+
+	/* The header format version written here is always 1. */
+	DLH_SET_VERSION(dlh, 1);
+	DLH_SET_FLAGS(dlh, ddl->ddl_flags);
+
+	dlh->dlh_checkpoint = ddl->ddl_checkpoint;
+	dlh->dlh_first_txg = ddl->ddl_first_txg;
+	dlh->dlh_length = ddl->ddl_length;
+
+	dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Allocate the on-disk object for one log, register it in the DDT
+ * directory under the name for index n, and write its initial header.
+ * The log must not already have an object.
+ */
+static void
+ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
+{
+	char objname[DDT_NAMELEN];
+
+	ASSERT3U(ddt->ddt_dir_object, >, 0);
+	ASSERT3U(ddl->ddl_object, ==, 0);
+
+	ddt_log_name(ddt, objname, n);
+
+	/* A brand new log is empty and starts life in this txg. */
+	ddl->ddl_first_txg = tx->tx_txg;
+	ddl->ddl_length = 0;
+
+	/* The bonus buffer is sized to carry the log header. */
+	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
+	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
+	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, objname,
+	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
+
+	ddt_log_update_header(ddt, ddl, tx);
+}
+
+/*
+ * Create both log objects: the current active log is registered as index 0
+ * and the current flushing log as index 1.
+ */
+static void
+ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_log_t *const logs[2] = {
+		ddt->ddt_log_active,
+		ddt->ddt_log_flushing,
+	};
+
+	for (uint_t n = 0; n < 2; n++)
+		ddt_log_create_one(ddt, logs[n], n, tx);
+}
+
+/*
+ * Remove the directory entry for log index n and free the log's on-disk
+ * object. Does nothing if the log has no object. The log is expected to
+ * be empty (zero length) by the time it is destroyed.
+ */
+static void
+ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
+{
+	ASSERT3U(ddt->ddt_dir_object, >, 0);
+
+	if (ddl->ddl_object != 0) {
+		char objname[DDT_NAMELEN];
+
+		ASSERT0(ddl->ddl_length);
+
+		ddt_log_name(ddt, objname, n);
+
+		VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, objname,
+		    tx));
+		VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));
+
+		/* Mark the log as having no backing object any more. */
+		ddl->ddl_object = 0;
+	}
+}
+
+/*
+ * Destroy both on-disk log objects, if they exist. The active log is
+ * handled with index 0 and the flushing log with index 1.
+ */
+void
+ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_log_t *const logs[2] = {
+		ddt->ddt_log_active,
+		ddt->ddt_log_flushing,
+	};
+
+	for (uint_t n = 0; n < 2; n++)
+		ddt_log_destroy_one(ddt, logs[n], n, tx);
+}
+
+/*
+ * Recompute ddt->ddt_log_stats from the current state of both logs:
+ * entry count and memory use from the in-memory trees, disk use from the
+ * on-disk objects.
+ */
+static void
+ddt_log_update_stats(ddt_t *ddt)
+{
+	/*
+	 * Log object stats. We count the number of live entries in the log
+	 * tree, even if there are more than on disk, and even if the same
+	 * entry is on both append and flush trees, because that's more what
+	 * the user expects to see. This does mean the on-disk size is not
+	 * really correlated with the number of entries, but I don't think
+	 * that's reasonable to expect anyway.
+	 */
+	dmu_object_info_t doi;
+	uint64_t nblocks = 0;
+
+	/*
+	 * Either log may not have an on-disk object yet (ddl_object == 0),
+	 * for instance when loading a table whose logs were never created.
+	 * In that case the lookup fails and doi is not filled in, so only
+	 * count blocks for objects we could actually get info on.
+	 */
+	if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object,
+	    &doi) == 0)
+		nblocks += doi.doi_physical_blocks_512;
+	if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object,
+	    &doi) == 0)
+		nblocks += doi.doi_physical_blocks_512;
+
+	ddt_object_t *ddo = &ddt->ddt_log_stats;
+	ddo->ddo_count =
+	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
+	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
+	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
+	/* doi_physical_blocks_512 is in 512-byte units; convert to bytes. */
+	ddo->ddo_dspace = nblocks << 9;
+}
+
+/*
+ * Start a log update that will append up to nentries records to the active
+ * log. Creates the log objects on first use, computes the per-record length
+ * for this table's layout, and holds enough whole blocks past the current
+ * end of the log to fit nentries records. State for the update is kept in
+ * dlu, which must not already be in use (dlu_dbp must be NULL).
+ *
+ * Follow with ddt_log_entry() for each entry, then ddt_log_commit().
+ */
+void
+ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
+{
+	ASSERT3U(nentries, >, 0);
+	ASSERT3P(dlu->dlu_dbp, ==, NULL);
+
+	if (ddt->ddt_log_active->ddl_object == 0)
+		ddt_log_create(ddt, tx);
+
+	/*
+	 * We want to store as many entries as we can in a block, but never
+	 * split an entry across block boundaries.
+	 *
+	 * NOTE(review): P2ALIGN-style macros conventionally round down; that
+	 * is only safe here if the summed size is already a multiple of
+	 * sizeof (uint64_t) -- confirm against the struct definitions.
+	 */
+	size_t reclen = P2ALIGN_TYPED(
+	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
+	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
+	ASSERT3U(reclen, <=, UINT16_MAX);
+	dlu->dlu_reclen = reclen;
+
+	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
+	    &dlu->dlu_dn));
+	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);
+
+	/* Whole records per block, then whole blocks needed for nentries. */
+	uint64_t nblocks = howmany(nentries,
+	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
+	/* New blocks start at the current end of the log. */
+	uint64_t offset = ddt->ddt_log_active->ddl_length;
+	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
+
+	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
+	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
+	    DMU_READ_NO_PREFETCH));
+
+	dlu->dlu_tx = tx;
+	/* Writing begins at the start of the first held block. */
+	dlu->dlu_block = dlu->dlu_offset = 0;
+}
+
+/*
+ * Allocate a zeroed log tree entry from the cache matching this table's
+ * phys layout (flat or traditional).
+ */
+static ddt_log_entry_t *
+ddt_log_alloc_entry(ddt_t *ddt)
+{
+	kmem_cache_t *cache;
+	size_t size;
+
+	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+		cache = ddt_log_entry_flat_cache;
+		size = DDT_LOG_ENTRY_FLAT_SIZE;
+	} else {
+		cache = ddt_log_entry_trad_cache;
+		size = DDT_LOG_ENTRY_TRAD_SIZE;
+	}
+
+	ddt_log_entry_t *ddle = kmem_cache_alloc(cache, KM_SLEEP);
+	memset(ddle, 0, size);
+
+	return (ddle);
+}
+
+/*
+ * Make the log tree hold the given entry: find the node for its key,
+ * inserting a new one if there is none, then overwrite the node's type,
+ * class and phys data with those from ddlwe.
+ */
+static void
+ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
+{
+	avl_index_t where;
+	ddt_log_entry_t *ddle;
+
+	ddle = avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
+	if (ddle == NULL) {
+		/* Not seen before; add a fresh node at the found position. */
+		ddle = ddt_log_alloc_entry(ddt);
+		ddle->ddle_key = ddlwe->ddlwe_key;
+		avl_insert(&ddl->ddl_tree, ddle, where);
+	}
+
+	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
+	ddle->ddle_class = ddlwe->ddlwe_class;
+	ddle->ddle_type = ddlwe->ddlwe_type;
+}
+
+/*
+ * Log one entry: update (or create) its node in the active log tree, count
+ * it in the log histogram, and append a DLR_ENTRY record for it to the
+ * blocks held in dlu.
+ *
+ * dlu must have been set up by ddt_log_begin() and not yet committed.
+ */
+void
+ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
+{
+	/* Pointer check, so use the pointer assert as ddt_log_begin() does. */
+	ASSERT3P(dlu->dlu_dbp, !=, NULL);
+
+	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
+	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+
+	/* Get our block */
+	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
+	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];
+
+	/*
+	 * If this would take us past the end of the block, finish it and
+	 * move to the next one.
+	 */
+	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
+		ASSERT3U(dlu->dlu_offset, >, 0);
+		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
+		dlu->dlu_block++;
+		dlu->dlu_offset = 0;
+		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
+		db = dlu->dlu_dbp[dlu->dlu_block];
+	}
+
+	/*
+	 * If this is the first time touching the block, inform the DMU that
+	 * we will fill it, and zero it out. The zeroing is what lets the
+	 * loader recognise the unused tail of a partially-filled block.
+	 */
+	if (dlu->dlu_offset == 0) {
+		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
+		memset(db->db_data, 0, db->db_size);
+	}
+
+	/* Create the log record directly in the buffer */
+	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
+	DLR_SET_TYPE(dlr, DLR_ENTRY);
+	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
+	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
+	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
+
+	/* The record payload carries the key followed by the phys data. */
+	ddt_log_record_entry_t *dlre =
+	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
+	dlre->dlre_key = ddlwe->ddlwe_key;
+	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
+
+	/* Advance offset for next record. */
+	dlu->dlu_offset += dlu->dlu_reclen;
+}
+
+/*
+ * Finish a log update started with ddt_log_begin(): close out the last
+ * block, release the held blocks and dnode, extend the active log's length
+ * by the blocks that were held, write the header, and reset dlu so it can
+ * be reused. At least one entry must have been logged, and the update must
+ * have reached the last held block.
+ */
+void
+ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
+{
+	/* Pointer check, so use the pointer assert as ddt_log_begin() does. */
+	ASSERT3P(dlu->dlu_dbp, !=, NULL);
+	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
+	ASSERT3U(dlu->dlu_offset, >, 0);
+
+	/*
+	 * Close out the last block. Whatever we haven't used will be zeroed,
+	 * which matches DLR_INVALID, so we can detect this during load.
+	 */
+	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
+
+	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
+
+	/* The log grows by whole blocks, regardless of how full the last is. */
+	ddt->ddt_log_active->ddl_length +=
+	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
+	dnode_rele(dlu->dlu_dn, FTAG);
+
+	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);
+
+	/* Clear the update state; ddt_log_begin() requires dlu_dbp == NULL. */
+	memset(dlu, 0, sizeof (ddt_log_update_t));
+
+	ddt_log_update_stats(ddt);
+}
+
+/*
+ * Remove the first (lowest-keyed) entry from the given log tree and hand
+ * it back in ddlwe, uncounting it from the log histogram. Returns B_FALSE
+ * if the tree is empty, in which case ddlwe is not written.
+ */
+boolean_t
+ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
+{
+	ddt_log_entry_t *first = avl_first(&ddl->ddl_tree);
+
+	if (first != NULL) {
+		/* Copy out before the node is released. */
+		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, first, ddlwe);
+		ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+
+		avl_remove(&ddl->ddl_tree, first);
+		if (ddt->ddt_flags & DDT_FLAG_FLAT)
+			kmem_cache_free(ddt_log_entry_flat_cache, first);
+		else
+			kmem_cache_free(ddt_log_entry_trad_cache, first);
+
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Remove the entry with the given key from the log tree, if present, and
+ * hand it back in ddlwe, uncounting it from the log histogram. Returns
+ * B_FALSE if no such entry exists, in which case ddlwe is not written.
+ */
+boolean_t
+ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
+    ddt_lightweight_entry_t *ddlwe)
+{
+	ddt_log_entry_t *found = avl_find(&ddl->ddl_tree, ddk, NULL);
+
+	if (found != NULL) {
+		/* Copy out before the node is released. */
+		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, found, ddlwe);
+		ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+
+		avl_remove(&ddl->ddl_tree, found);
+		if (ddt->ddt_flags & DDT_FLAG_FLAT)
+			kmem_cache_free(ddt_log_entry_flat_cache, found);
+		else
+			kmem_cache_free(ddt_log_entry_trad_cache, found);
+
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Record a flush checkpoint on the flushing log: set the checkpoint flag,
+ * remember the key of the given entry as the checkpoint, and write the
+ * header. On load, records with keys at or before the checkpoint are
+ * skipped as already flushed.
+ */
+void
+ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
+{
+	ddt_log_t *ddl = ddt->ddt_log_flushing;
+
+	ASSERT3U(ddl->ddl_object, !=, 0);
+
+#ifdef ZFS_DEBUG
+	/*
+	 * There should not be any entries on the log tree before the given
+	 * checkpoint. Assert that this is the case.
+	 *
+	 * ddt_key_compare() returns a signed result, so this must be a
+	 * signed comparison; an unsigned one would treat a negative result
+	 * (first entry sorts before the checkpoint) as a large positive
+	 * value and let the violation through.
+	 */
+	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
+	if (ddle != NULL)
+		VERIFY3S(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
+		    >, 0);
+#endif
+
+	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
+	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
+	ddt_log_update_header(ddt, ddl, tx);
+
+	ddt_log_update_stats(ddt);
+}
+
+/*
+ * Discard the entire on-disk contents of the flushing log and reset its
+ * length and checkpoint. The in-memory flushing tree must already be
+ * empty. Does nothing if the log has no on-disk object.
+ */
+void
+ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_log_t *flushing = ddt->ddt_log_flushing;
+
+	/* No object means there is nothing on disk to throw away. */
+	if (flushing->ddl_object == 0)
+		return;
+
+	ASSERT(avl_is_empty(&flushing->ddl_tree));
+
+	/* Free every block of the object in one go. */
+	dmu_free_range(ddt->ddt_os, flushing->ddl_object, 0, DMU_OBJECT_END,
+	    tx);
+
+	/* Back to "empty, no checkpoint", both in memory and on disk. */
+	memset(&flushing->ddl_checkpoint, 0, sizeof (ddt_key_t));
+	flushing->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
+	flushing->ddl_length = 0;
+	ddt_log_update_header(ddt, flushing, tx);
+
+	ddt_log_update_stats(ddt);
+}
+
+/*
+ * Swap the active and flushing logs if the swap policy says it is time,
+ * so that what has accumulated on the active log can start being flushed.
+ * The flushing tree must be empty on entry. Returns B_TRUE if the logs
+ * were swapped, B_FALSE if they were left alone.
+ */
+boolean_t
+ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
+{
+	/* The log we are about to start appending to must be drained. */
+	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));
+
+	/*
+	 * The flushing log can still have blocks on disk even though its
+	 * tree is empty: entries that ddt_lookup() removed from memory
+	 * leave their vestigial records behind. Clear those out first.
+	 */
+	if (ddt->ddt_log_flushing->ddl_length > 0)
+		ddt_log_truncate(ddt, tx);
+
+	/*
+	 * Swap policy: swap (and so begin flushing) when the active tree
+	 * has grown too large, or when it has gone too long without a swap.
+	 *
+	 * "Too large" is the entries' memory usage reaching half of the
+	 * memory limit, which effectively gives each log tree half of the
+	 * available memory.
+	 */
+	const boolean_t too_large =
+	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
+	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);
+
+	/* "Too old" is measured in txgs since the active log was started. */
+	const boolean_t too_old =
+	    tx->tx_txg >=
+	    (ddt->ddt_log_active->ddl_first_txg +
+	    MAX(1, zfs_dedup_log_txg_max));
+
+	if (!too_large && !too_old)
+		return (B_FALSE);
+
+	/* Exchange the roles of the two logs. */
+	ddt_log_t *was_active = ddt->ddt_log_active;
+	ddt_log_t *was_flushing = ddt->ddt_log_flushing;
+	ddt->ddt_log_active = was_flushing;
+	ddt->ddt_log_flushing = was_active;
+
+	/* The new active log sheds its flushing state and any checkpoint. */
+	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
+	ddt->ddt_log_active->ddl_flags &=
+	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);
+
+	/* The new flushing log is marked as such. */
+	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
+	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
+
+	/* The age clock for the new active log starts now. */
+	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;
+
+	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
+	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);
+
+	ddt_log_update_stats(ddt);
+
+	return (B_TRUE);
+}
+
+/*
+ * Apply one DLR_ENTRY record read from disk to the given log tree. If a
+ * checkpoint key is supplied, records whose key is at or before it are
+ * ignored.
+ */
+static inline void
+ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
+    const ddt_key_t *checkpoint)
+{
+	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
+
+	ddt_log_record_entry_t *dlre =
+	    (ddt_log_record_entry_t *)dlr->dlr_payload;
+
+	/* Skip pre-checkpoint entries; they're already flushed. */
+	if (checkpoint != NULL &&
+	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0)
+		return;
+
+	/* Rebuild a lightweight entry from the record and store it. */
+	ddt_lightweight_entry_t ddlwe;
+	ddlwe.ddlwe_key = dlre->dlre_key;
+	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
+	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
+	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
+
+	ddt_log_update_entry(ddt, ddl, &ddlwe);
+}
+
+/*
+ * Free every entry in the given log tree, leaving the tree empty. Only
+ * in-memory state is touched.
+ */
+static void
+ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
+{
+	void *cookie = NULL;
+	ddt_log_entry_t *ddle;
+
+	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
+
+	for (ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie);
+	    ddle != NULL;
+	    ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie)) {
+		/* Entries go back to the cache for this table's layout. */
+		if (ddt->ddt_flags & DDT_FLAG_FLAT)
+			kmem_cache_free(ddt_log_entry_flat_cache, ddle);
+		else
+			kmem_cache_free(ddt_log_entry_trad_cache, ddle);
+	}
+
+	ASSERT(avl_is_empty(&ddl->ddl_tree));
+}
+
+/*
+ * Load log object n (0 or 1) from disk into ddt->ddt_log[n].
+ *
+ * Looks the object up in the DDT directory, reads its header, replays all
+ * of its records into the in-memory tree (skipping records covered by the
+ * header's checkpoint, if any), and finally installs the log as the active
+ * or flushing log according to its header flags.
+ *
+ * Returns 0 on success, including when the object does not exist. Returns
+ * EINVAL for an unknown header version or a malformed record, or the error
+ * from the underlying lookup/hold. On failure after records have been
+ * loaded, the tree is emptied again.
+ */
+static int
+ddt_log_load_one(ddt_t *ddt, uint_t n)
+{
+	ASSERT3U(n, <, 2);
+
+	ddt_log_t *ddl = &ddt->ddt_log[n];
+
+	char name[DDT_NAMELEN];
+	ddt_log_name(ddt, name, n);
+
+	uint64_t obj;
+	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
+	    sizeof (uint64_t), 1, &obj);
+	if (err == ENOENT)
+		return (0);
+	if (err != 0)
+		return (err);
+
+	dnode_t *dn;
+	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+
+	/* Take a private copy of the header so the bonus can be released. */
+	ddt_log_header_t hdr;
+	dmu_buf_t *db;
+	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0) {
+		dnode_rele(dn, FTAG);
+		return (err);
+	}
+	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
+	dmu_buf_rele(db, FTAG);
+
+	if (DLH_GET_VERSION(&hdr) != 1) {
+		dnode_rele(dn, FTAG);
+		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
+		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
+		    (u_longlong_t)DLH_GET_VERSION(&hdr));
+		return (SET_ERROR(EINVAL));
+	}
+
+	ddt_key_t *checkpoint = NULL;
+	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
+		/*
+		 * If the log has a checkpoint, then we can ignore any entries
+		 * that have already been flushed.
+		 */
+		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
+		checkpoint = &hdr.dlh_checkpoint;
+	}
+
+	if (hdr.dlh_length > 0) {
+		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
+		    ZIO_PRIORITY_SYNC_READ);
+
+		/* Walk the log one block at a time. */
+		for (uint64_t offset = 0; offset < hdr.dlh_length;
+		    offset += dn->dn_datablksz) {
+			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
+			    DMU_READ_PREFETCH);
+			if (err != 0) {
+				dnode_rele(dn, FTAG);
+				ddt_log_empty(ddt, ddl);
+				return (err);
+			}
+
+			uint64_t boffset = 0;
+			while (boffset < db->db_size) {
+				ddt_log_record_t *dlr =
+				    (ddt_log_record_t *)(db->db_data + boffset);
+
+				/* Partially-filled block, skip the rest */
+				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
+					break;
+
+				/*
+				 * The record length comes straight from
+				 * disk. A zero length would never advance,
+				 * and a length running past the end of the
+				 * block would have us read outside the
+				 * buffer; the writer never splits a record
+				 * across blocks, so either is corruption.
+				 */
+				uint64_t reclen = DLR_GET_RECLEN(dlr);
+				if (reclen == 0 ||
+				    reclen > db->db_size - boffset) {
+					dmu_buf_rele(db, FTAG);
+					dnode_rele(dn, FTAG);
+					ddt_log_empty(ddt, ddl);
+					return (SET_ERROR(EINVAL));
+				}
+
+				switch (DLR_GET_TYPE(dlr)) {
+				case DLR_ENTRY:
+					ddt_log_load_entry(ddt, ddl, dlr,
+					    checkpoint);
+					break;
+
+				default:
+					/* Unknown record type. */
+					dmu_buf_rele(db, FTAG);
+					dnode_rele(dn, FTAG);
+					ddt_log_empty(ddt, ddl);
+					return (SET_ERROR(EINVAL));
+				}
+
+				boffset += reclen;
+			}
+
+			dmu_buf_rele(db, FTAG);
+		}
+	}
+
+	dnode_rele(dn, FTAG);
+
+	/*
+	 * Adopt the on-disk state. The checkpoint key is carried over along
+	 * with the flags so the in-memory log agrees with its header; without
+	 * it, DDL_FLAG_CHECKPOINT could be set next to a zeroed key.
+	 */
+	ddl->ddl_object = obj;
+	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
+	ddl->ddl_length = hdr.dlh_length;
+	ddl->ddl_first_txg = hdr.dlh_first_txg;
+	ddl->ddl_checkpoint = hdr.dlh_checkpoint;
+
+	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
+		ddt->ddt_log_flushing = ddl;
+	else
+		ddt->ddt_log_active = ddl;
+
+	return (0);
+}
+
+/*
+ * Load both logs from disk and rebuild the derived in-memory state (log
+ * histogram and stats).
+ *
+ * Returns 0 on success, and also when loading is skipped because the pool
+ * is only being opened for tryimport. Returns ENOENT if the DDT directory
+ * object does not exist, or the error from loading either log.
+ */
+int
+ddt_log_load(ddt_t *ddt)
+{
+	int err;
+
+	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
+		/*
+		 * The DDT is going to be freed again in a moment, so there's
+		 * no point loading the log; it'll just slow down import.
+		 */
+		return (0);
+	}
+
+	ASSERT0(ddt->ddt_log[0].ddl_object);
+	ASSERT0(ddt->ddt_log[1].ddl_object);
+	if (ddt->ddt_dir_object == 0) {
+		/*
+		 * If we're configured but the containing dir doesn't exist
+		 * yet, then the log object can't possibly exist either.
+		 */
+		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
+		return (SET_ERROR(ENOENT));
+	}
+
+	if ((err = ddt_log_load_one(ddt, 0)) != 0)
+		return (err);
+	if ((err = ddt_log_load_one(ddt, 1)) != 0)
+		return (err);
+
+	/* Exactly one log must be active and the other flushing. */
+	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
+	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
+
+	/*
+	 * We have two finalisation tasks:
+	 *
+	 * - rebuild the histogram. We do this at the end rather than while
+	 *   we're loading so we don't need to uncount and recount entries that
+	 *   appear multiple times in the log.
+	 *
+	 * - remove entries from the flushing tree that are on both trees. This
+	 *   happens when ddt_lookup() rehydrates an entry from the flushing
+	 *   tree, as ddt_log_take_key() removes the entry from the in-memory
+	 *   tree but doesn't remove it from disk.
+	 */
+
+	/*
+	 * We don't technically need a config lock here, since there shouldn't
+	 * be pool config changes during DDT load. dva_get_dsize_sync() via
+	 * ddt_stat_generate() is expecting it though, and it won't hurt
+	 * anything, so we take it.
+	 */
+	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
+
+	/*
+	 * Walk both trees together in key order, like a merge, so that each
+	 * distinct key is visited exactly once.
+	 */
+	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
+	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
+	ddt_log_entry_t *ae = avl_first(al);
+	ddt_log_entry_t *fe = avl_first(fl);
+	while (ae != NULL || fe != NULL) {
+		ddt_log_entry_t *ddle;
+		if (ae == NULL) {
+			/* active exhausted, take flushing */
+			ddle = fe;
+			fe = AVL_NEXT(fl, fe);
+		} else if (fe == NULL) {
+			/* flushing exhausted, take active */
+			ddle = ae;
+			ae = AVL_NEXT(al, ae);
+		} else {
+			/* compare active and flushing */
+			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
+			if (c < 0) {
+				/* active behind, take and advance */
+				ddle = ae;
+				ae = AVL_NEXT(al, ae);
+			} else if (c > 0) {
+				/* flushing behind, take and advance */
+				ddle = fe;
+				fe = AVL_NEXT(fl, fe);
+			} else {
+				/*
+				 * match. remove from flushing, take active.
+				 * The iterator is advanced before removal,
+				 * and the removed entry must be freed here
+				 * since nothing else references it any more.
+				 */
+				ddle = fe;
+				fe = AVL_NEXT(fl, fe);
+				avl_remove(fl, ddle);
+				kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+				    ddt_log_entry_flat_cache :
+				    ddt_log_entry_trad_cache, ddle);
+
+				ddle = ae;
+				ae = AVL_NEXT(al, ae);
+			}
+		}
+
+		ddt_lightweight_entry_t ddlwe;
+		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
+		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
+	}
+
+	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);
+
+	ddt_log_update_stats(ddt);
+
+	return (0);
+}
+
+/*
+ * Set up the in-memory state for both logs of a ddt: create the two entry
+ * trees and assign the initial roles, with log 0 active and log 1 flushing.
+ */
+void
+ddt_log_alloc(ddt_t *ddt)
+{
+	ASSERT3P(ddt->ddt_log_active, ==, NULL);
+	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);
+
+	/* Both trees are keyed the same way, via ddt_key_compare. */
+	for (uint_t n = 0; n < 2; n++) {
+		avl_create(&ddt->ddt_log[n].ddl_tree, ddt_key_compare,
+		    sizeof (ddt_log_entry_t),
+		    offsetof(ddt_log_entry_t, ddle_node));
+	}
+
+	ddt->ddt_log_flushing = &ddt->ddt_log[1];
+	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
+	ddt->ddt_log_active = &ddt->ddt_log[0];
+}
+
+/*
+ * Release the in-memory state for both logs: free all tree entries and
+ * destroy the trees. On-disk objects are not touched.
+ */
+void
+ddt_log_free(ddt_t *ddt)
+{
+	for (uint_t n = 0; n < 2; n++) {
+		ddt_log_t *ddl = &ddt->ddt_log[n];
+
+		ddt_log_empty(ddt, ddl);
+		avl_destroy(&ddl->ddl_tree);
+	}
+}
+
+/*
+ * Tunables for the log swap policy. log_mem_max and log_mem_max_percent are
+ * consumed by ddt_log_init(); log_txg_max is read by ddt_log_swap().
+ */
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
+	"Max transactions before starting to flush dedup logs");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
+	"Max memory for dedup logs");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
+	"Max memory for dedup logs, as % of total memory");
@@ -42,7 +42,7 @@ ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,

 	memset(dds, 0, sizeof (*dds));

-	for (int p = 0; p < ddlwe->ddlwe_nphys; p++) {
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

@@ -222,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 				ddo_total->ddo_mspace += ddo->ddo_mspace;
 			}
 		}
+
+		ddt_object_t *ddo = &ddt->ddt_log_stats;
+		ddo_total->ddo_count += ddo->ddo_count;
+		ddo_total->ddo_dspace += ddo->ddo_dspace;
+		ddo_total->ddo_mspace += ddo->ddo_mspace;
 	}

 	/*
@@ -259,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 				    &ddt->ddt_histogram_cache[type][class]);
 			}
 		}
+
+		ddt_histogram_add(ddh, &ddt->ddt_log_histogram);
 	}
 }