ddt: dedup log

Adds a log/journal to dedup. At the end of txg, instead of writing the entry directly to the ZAP, instead its adding to an in-memory tree and appended to an on-disk object. The on-disk object is only read at import, to reload the in-memory tree. Lookups first go the the log tree before going to the ZAP, so recently-used entries will remain close by in memory. This vastly reduces overhead from dedup IO, as it will not have to do so many read/update/write cycles on ZAP leaf nodes. A flushing facility is added at end of txg, to push logged entries out to the ZAP. There's actually two separate "logs" (in-memory tree and on-disk object), one active (recieving updated entries) and one flushing (writing out to disk). These are swapped (ie flushing begins) based on memory used by the in-memory log trees and time since we last flushed something. The flushing facility monitors the amount of entries coming in and being flushed out, and calibrates itself to try to flush enough each txg to keep up with the ingest rate without competing too much with other IO. Multiple tuneables are provided to control the flushing facility. All the histograms and stats are update to accomodate the log as a separate entry store. zdb gains knowledge of how to count them and dump them. Documentation included! Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Co-authored-by: Allan Jude <allan@klarasystems.com> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15895
2026-05-23 02:44:41 +03:00 · 2023-06-22 17:46:22 +10:00
parent cbb9ef0a4c
commit cd69ba3d49
17 changed files with 1621 additions and 131 deletions
@@ -1958,6 +1958,32 @@ dump_dedup_ratio(const ddt_stat_t *dds)
 	    dedup, compress, copies, dedup * compress / copies);
 }

+static void
+dump_ddt_log(ddt_t *ddt)
+{
+	for (int n = 0; n < 2; n++) {
+		ddt_log_t *ddl = &ddt->ddt_log[n];
+
+		uint64_t count = avl_numnodes(&ddl->ddl_tree);
+		if (count == 0)
+			continue;
+
+		printf(DMU_POOL_DDT_LOG ": %lu log entries\n",
+		    zio_checksum_table[ddt->ddt_checksum].ci_name, n, count);
+
+		if (dump_opt['D'] < 4)
+			continue;
+
+		ddt_lightweight_entry_t ddlwe;
+		uint64_t index = 0;
+		for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
+		    ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) {
+			DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
+			dump_ddt_entry(ddt, &ddlwe, index++);
+		}
+	}
+}
+
 static void
 dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
@@ -2027,6 +2053,7 @@ dump_all_ddts(spa_t *spa)
 				dump_ddt(ddt, type, class);
 			}
 		}
+		dump_ddt_log(ddt);
 	}

 	ddt_get_dedup_stats(spa, &dds_total);
@@ -5743,7 +5770,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
 			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));

 		/* Consume a reference for this block. */
-		VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0);
+		VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0);
 		ddt_phys_decref(dde->dde_phys, v);

 		/*
@@ -8120,6 +8147,10 @@ dump_mos_leaks(spa_t *spa)

 		/* FDT container */
 		mos_obj_refd(ddt->ddt_dir_object);
+
+		/* FDT log objects */
+		mos_obj_refd(ddt->ddt_log[0].ddl_object);
+		mos_obj_refd(ddt->ddt_log[1].ddl_object);
 	}

 	if (spa->spa_brt != NULL) {
@@ -43,7 +43,8 @@ struct abd;
 * DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
 */
 #define	DDT_FLAG_FLAT	(1 << 0)	/* single extensible phys */
-#define	DDT_FLAG_MASK	(DDT_FLAG_FLAT)
+#define	DDT_FLAG_LOG	(1 << 1)	/* dedup log (journal) */
+#define	DDT_FLAG_MASK	(DDT_FLAG_FLAT|DDT_FLAG_LOG)

 /*
 * DDT on-disk storage object types. Each one corresponds to specific
@@ -209,6 +210,7 @@ typedef enum {
 /* State flags for dde_flags */
 #define	DDE_FLAG_LOADED		(1 << 0)	/* entry ready for use */
 #define	DDE_FLAG_OVERQUOTA	(1 << 1)	/* entry unusable, no space */
+#define	DDE_FLAG_LOGGED		(1 << 2)	/* loaded from log */

 /*
 * Additional data to support entry update or repair. This is fixed size
@@ -254,6 +256,19 @@ typedef struct {
 	ddt_univ_phys_t	ddlwe_phys;
 } ddt_lightweight_entry_t;

+/*
+ * In-core DDT log. A separate struct to make it easier to switch between the
+ * appending and flushing logs.
+ */
+typedef struct {
+	avl_tree_t	ddl_tree;	/* logged entries */
+	uint32_t	ddl_flags;	/* flags for this log */
+	uint64_t	ddl_object;	/* log object id */
+	uint64_t	ddl_length;	/* on-disk log size */
+	uint64_t	ddl_first_txg;	/* txg log became active */
+	ddt_key_t	ddl_checkpoint;	/* last checkpoint */
+} ddt_log_t;
+
 /*
 * In-core DDT object. This covers all entries and stats for a the whole pool
 * for a given checksum type.
@@ -262,8 +277,22 @@ typedef struct {
 	kmutex_t	ddt_lock;	/* protects changes to all fields */

 	avl_tree_t	ddt_tree;	/* "live" (changed) entries this txg */
+	avl_tree_t	ddt_log_tree;	/* logged entries */

-	avl_tree_t	ddt_repair_tree; /* entries being repaired */
+	avl_tree_t	ddt_repair_tree;	/* entries being repaired */
+
+	ddt_log_t	ddt_log[2];		/* active/flushing logs */
+	ddt_log_t	*ddt_log_active;	/* pointers into ddt_log */
+	ddt_log_t	*ddt_log_flushing;	/* swapped when flush starts */
+
+	hrtime_t	ddt_flush_start;	/* log flush start this txg */
+	uint32_t	ddt_flush_pass;		/* log flush pass this txg */
+
+	int32_t		ddt_flush_count;	/* entries flushed this txg */
+	int32_t		ddt_flush_min;		/* min rem entries to flush */
+	int32_t		ddt_log_ingest_rate;	/* rolling log ingest rate */
+	int32_t		ddt_log_flush_rate;	/* rolling log flush rate */
+	int32_t		ddt_log_flush_time_rate; /* avg time spent flushing */

 	enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
 	spa_t		*ddt_spa;	/* pool this ddt is on */
@@ -276,13 +305,17 @@ typedef struct {
 	/* per-type/per-class entry store objects */
 	uint64_t	ddt_object[DDT_TYPES][DDT_CLASSES];

-	/* object ids for whole-ddt and per-type/per-class stats */
+	/* object ids for stored, logged and per-type/per-class stats */
 	uint64_t	ddt_stat_object;
+	ddt_object_t	ddt_log_stats;
 	ddt_object_t	ddt_object_stats[DDT_TYPES][DDT_CLASSES];

 	/* type/class stats by power-2-sized referenced blocks */
 	ddt_histogram_t	ddt_histogram[DDT_TYPES][DDT_CLASSES];
 	ddt_histogram_t	ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
+
+	/* log stats power-2-sized referenced blocks */
+	ddt_histogram_t	ddt_log_histogram;
 } ddt_t;

 /*
@@ -28,6 +28,7 @@
 #define	_SYS_DDT_IMPL_H

 #include <sys/ddt.h>
+#include <sys/bitops.h>

 #ifdef	__cplusplus
 extern "C" {
@@ -50,6 +51,106 @@ extern "C" {
 	memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
 } while (0)

+#define	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do {             \
+	memset((ddlwe), 0, sizeof (*ddlwe));                            \
+	(ddlwe)->ddlwe_key = (ddle)->ddle_key;                          \
+	(ddlwe)->ddlwe_type = (ddle)->ddle_type;                        \
+	(ddlwe)->ddlwe_class = (ddle)->ddle_class;                      \
+	memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \
+} while (0)
+
+/*
+ * An entry on the log tree. These are "frozen", and a record of what's in
+ * the on-disk log. They can't be used in place, but can be "loaded" back into
+ * the live tree.
+ */
+typedef struct {
+	ddt_key_t	ddle_key;	/* ddt_log_tree key */
+	avl_node_t	ddle_node;	/* ddt_log_tree node */
+
+	ddt_type_t	ddle_type;	/* storage type */
+	ddt_class_t	ddle_class;	/* storage class */
+
+	/* extra allocation for flat/trad phys */
+	ddt_univ_phys_t	ddle_phys[];
+} ddt_log_entry_t;
+
+/* On-disk log record types. */
+typedef enum {
+	DLR_INVALID	= 0,	/* end of block marker */
+	DLR_ENTRY	= 1,	/* an entry to add or replace in the log tree */
+} ddt_log_record_type_t;
+
+/* On-disk log record header. */
+typedef struct {
+	/*
+	 * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to
+	 * access it.
+	 *
+	 * bits 0-7:    record type (ddt_log_record_type_t)
+	 * bits 8-15:  length of record header+payload
+	 * bits 16-47:  reserved, all zero
+	 * bits 48-55:   if type==DLR_ENTRY, storage type (ddt_type)
+	 *                otherwise all zero
+	 * bits 56-63:  if type==DLR_ENTRY, storage class (ddt_class)
+	 *                otherwise all zero
+	 */
+	uint64_t	dlr_info;
+	uint8_t		dlr_payload[];
+} ddt_log_record_t;
+
+#define	DLR_GET_TYPE(dlr)		BF64_GET((dlr)->dlr_info, 0, 8)
+#define	DLR_SET_TYPE(dlr, v)		BF64_SET((dlr)->dlr_info, 0, 8, v)
+#define	DLR_GET_RECLEN(dlr)		BF64_GET((dlr)->dlr_info, 8, 16)
+#define	DLR_SET_RECLEN(dlr, v)		BF64_SET((dlr)->dlr_info, 8, 16, v)
+#define	DLR_GET_ENTRY_TYPE(dlr)		BF64_GET((dlr)->dlr_info, 48, 8)
+#define	DLR_SET_ENTRY_TYPE(dlr, v)	BF64_SET((dlr)->dlr_info, 48, 8, v)
+#define	DLR_GET_ENTRY_CLASS(dlr)	BF64_GET((dlr)->dlr_info, 56, 8)
+#define	DLR_SET_ENTRY_CLASS(dlr, v)	BF64_SET((dlr)->dlr_info, 56, 8, v)
+
+/* Payload for DLR_ENTRY. */
+typedef struct {
+	ddt_key_t	dlre_key;
+	ddt_univ_phys_t	dlre_phys[];
+} ddt_log_record_entry_t;
+
+/* Log flags (ddl_flags, dlh_flags) */
+#define	DDL_FLAG_FLUSHING	(1 << 0)	/* this log is being flushed */
+#define	DDL_FLAG_CHECKPOINT	(1 << 1)	/* header has a checkpoint */
+
+/* On-disk log header, stored in the bonus buffer. */
+typedef struct {
+	/*
+	 * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to
+	 * access it.
+	 *
+	 * bits 0-7:   log version
+	 * bits 8-15:  log flags
+	 * bits 16-63: reserved, all zero
+	 */
+	uint64_t	dlh_info;
+
+	uint64_t	dlh_length;	/* log size in bytes */
+	uint64_t	dlh_first_txg;	/* txg this log went active */
+	ddt_key_t	dlh_checkpoint;	/* last checkpoint */
+} ddt_log_header_t;
+
+#define	DLH_GET_VERSION(dlh)	BF64_GET((dlh)->dlh_info, 0, 8)
+#define	DLH_SET_VERSION(dlh, v)	BF64_SET((dlh)->dlh_info, 0, 8, v)
+#define	DLH_GET_FLAGS(dlh)	BF64_GET((dlh)->dlh_info, 8, 8)
+#define	DLH_SET_FLAGS(dlh, v)	BF64_SET((dlh)->dlh_info, 8, 8, v)
+
+/* DDT log update state */
+typedef struct {
+	dmu_tx_t	*dlu_tx;	/* tx the update is being applied to */
+	dnode_t		*dlu_dn;	/* log object dnode */
+	dmu_buf_t	**dlu_dbp;	/* array of block buffer pointers */
+	int		dlu_ndbp;	/* number of block buffer pointers */
+	uint16_t	dlu_reclen;	/* cached length of record */
+	uint64_t	dlu_block;	/* block for next entry */
+	uint64_t	dlu_offset;	/* offset for next entry */
+} ddt_log_update_t;
+
 /*
 * Ops vector to access a specific DDT object type.
 */
@@ -77,6 +178,33 @@ typedef struct {

 extern const ddt_ops_t ddt_zap_ops;

+/* Dedup log API */
+extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx,
+    ddt_log_update_t *dlu);
+extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde,
+    ddt_log_update_t *dlu);
+extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
+
+extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
+    ddt_lightweight_entry_t *ddlwe);
+extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
+    const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);
+
+extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
+    dmu_tx_t *tx);
+extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx);
+
+extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx);
+
+extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx);
+
+extern int ddt_log_load(ddt_t *ddt);
+extern void ddt_log_alloc(ddt_t *ddt);
+extern void ddt_log_free(ddt_t *ddt);
+
+extern void ddt_log_init(void);
+extern void ddt_log_fini(void);
+
 /*
 * These are only exposed so that zdb can access them. Try not to use them
 * outside of the DDT implementation proper, and if you do, consider moving
@@ -89,7 +217,8 @@ extern const ddt_ops_t ddt_zap_ops;
 */
 #define	DDT_NAMELEN	32

-extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde);
+extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt,
+    const ddt_univ_phys_t *ddp);

 extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);

@@ -375,6 +375,7 @@ typedef struct dmu_buf {
 #define	DMU_POOL_L2CACHE		"l2cache"
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
+#define	DMU_POOL_DDT_LOG		"DDT-log-%s-%u"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
 #define	DMU_POOL_DDT_DIR		"DDT-%s"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
@@ -79,6 +79,7 @@ nodist_libzpool_la_SOURCES = \
 	module/zfs/dbuf.c \
 	module/zfs/dbuf_stats.c \
 	module/zfs/ddt.c \
+	module/zfs/ddt_log.c \
 	module/zfs/ddt_stats.c \
 	module/zfs/ddt_zap.c \
 	module/zfs/dmu.c \
@@ -974,6 +974,88 @@ milliseconds until the operation completes.
 .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Enable prefetching dedup-ed blocks which are going to be freed.
 .
+.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint
+Maximum number of dedup log flush passes (iterations) each transaction.
+.Pp
+At the start of each transaction, OpenZFS will estimate how many entries it
+needs to flush out to keep up with the change rate, taking the amount and time
+taken to flush on previous txgs into account (see
+.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
+It will spread this amount into a number of passes.
+At each pass, it will use the amount already flushed and the total time taken
+by flushing and by other IO to recompute how much it should do for the remainder
+of the txg.
+.Pp
+Reducing the max number of passes will make flushing more aggressive, flushing
+out more entries on each pass.
+This can be faster, but also more likely to compete with other IO.
+Increasing the max number of passes will put fewer entries onto each pass,
+keeping the overhead of dedup changes to a minimum but possibly causing a large
+number of changes to be dumped on the last pass, which can blow out the txg
+sync time beyond
+.Sy zfs_txg_timeout .
+.
+.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint
+Minimum time to spend on dedup log flush each transaction.
+.Pp
+At least this long will be spent flushing dedup log entries each transaction,
+up to
+.Sy zfs_txg_timeout .
+This occurs even if doing so would delay the transaction, that is, other IO
+completes under this time.
+.
+.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint
+Flush at least this many entries each transaction.
+.Pp
+OpenZFS will estimate how many entries it needs to flush each transaction to
+keep up with the ingest rate (see
+.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
+This sets the minimum for that estimate.
+Raising it can force OpenZFS to flush more aggressively, keeping the log small
+and so reducing pool import times, but can make it less able to back off if
+log flushing would compete with other IO too much.
+.
+.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
+Number of transactions to use to compute the flow rate.
+.Pp
+OpenZFS will estimate how many entries it needs to flush each transaction by
+monitoring the number of entries changed (ingest rate), number of entries
+flushed (flush rate) and time spent flushing (flush time rate) and combining
+these into an overall "flow rate".
+It will use an exponential weighted moving average over some number of recent
+transactions to compute these rates.
+This sets the number of transactions to compute these averages over.
+Setting it higher can help to smooth out the flow rate in the face of spiky
+workloads, but will take longer for the flow rate to adjust to a sustained
+change in the ingress rate.
+.
+.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint
+Max transactions to before starting to flush dedup logs.
+.Pp
+OpenZFS maintains two dedup logs, one receiving new changes, one flushing.
+If there is nothing to flush, it will accumulate changes for no more than this
+many transactions before switching the logs and starting to flush entries out.
+.
+.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64
+Max memory to use for dedup logs.
+.Pp
+OpenZFS will spend no more than this much memory on maintaining the in-memory
+dedup log.
+Flushing will begin when around half this amount is being spent on logs.
+The default value of
+.Sy 0
+will cause it to be set by
+.Sy zfs_dedup_log_mem_max_percent
+instead.
+.
+.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint
+Max memory to use for dedup logs, as a percentage of total memory.
+.Pp
+If
+.Sy zfs_dedup_log_mem_max
+is not set, it will be initialised as a percentage of the total memory in the
+system.
+.
 .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint
 Start to delay each transaction once there is this amount of dirty data,
 expressed as a percentage of
@@ -322,6 +322,7 @@ ZFS_OBJS := \
 	dbuf.o \
 	dbuf_stats.o \
 	ddt.o \
+	ddt_log.o \
 	ddt_stats.o \
 	ddt_zap.o \
 	dmu.o \
@@ -252,6 +252,7 @@ SRCS+=	abd.c \
 	dbuf.c \
 	dbuf_stats.c \
 	ddt.c \
+	ddt_log.c \
 	ddt_stats.c \
 	ddt_zap.c \
 	dmu.c \
@@ -426,6 +427,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast

 CFLAGS.abd.c= -Wno-cast-qual
 CFLAGS.ddt.c= -Wno-cast-qual
+CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith
 CFLAGS.ddt_zap.c= -Wno-cast-qual
 CFLAGS.dmu.c= -Wno-cast-qual
 CFLAGS.dmu_traverse.c= -Wno-cast-qual
@@ -125,6 +125,28 @@
 * without which, no space would be recovered and the DDT would continue to be
 * considered "over quota". See zap_shrink_enabled.
 *
+ * ## Dedup log
+ *
+ * Historically, all entries modified on a txg were written back to dedup
+ * storage objects at the end of every txg. This could cause significant
+ * overheads, as each entry only takes up a tiny portion of a ZAP leaf node,
+ * and so required reading the whole node, updating the entry, and writing it
+ * back. On busy pools, this could add serious IO and memory overheads.
+ *
+ * To address this, the dedup log was added. If the "fast_dedup" feature is
+ * enabled, at the end of each txg, modified entries will be copied to an
+ * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the
+ * same block is requested again, the in-memory object will be checked first,
+ * and if its there, the entry inflated back onto the live tree without going
+ * to storage. The on-disk log is only read at pool import time, to reload the
+ * in-memory log.
+ *
+ * Each txg, some amount of the in-memory log will be flushed out to a DDT
+ * storage object (ie ZAP) as normal. OpenZFS will try hard to flush enough to
+ * keep up with the rate of change on dedup entries, but not so much that it
+ * would impact overall throughput, and not using too much memory. See the
+ * zfs_dedup_log_* tuneables in zfs(4) for more details.
+ *
 * ## Repair IO
 *
 * If a read on a dedup block fails, but there are other copies of the block in
@@ -201,6 +223,26 @@ int zfs_dedup_prefetch = 0;
 uint_t dedup_class_wait_txgs = 5;


+/*
+ * Don't do more than this many incremental flush passes per txg.
+ */
+uint_t zfs_dedup_log_flush_passes_max = 8;
+
+/*
+ * Minimum time to flush per txg.
+ */
+uint_t zfs_dedup_log_flush_min_time_ms = 1000;
+
+/*
+ * Minimum entries to flush per txg.
+ */
+uint_t zfs_dedup_log_flush_entries_min = 1000;
+
+/*
+ * Number of txgs to average flow rates across.
+ */
+uint_t zfs_dedup_log_flush_flow_rate_txgs = 10;
+
 static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
 	&ddt_zap_ops,
 };
@@ -217,7 +259,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = {
 */
 static const uint64_t ddt_version_flags[] = {
 	[DDT_VERSION_LEGACY] = 0,
-	[DDT_VERSION_FDT] = DDT_FLAG_FLAT,
+	[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
 };

 /* Dummy version to signal that configure is still necessary */
@@ -405,13 +447,13 @@ ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)

 static int
 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
-    ddt_entry_t *dde, dmu_tx_t *tx)
+    const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));

 	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
-	    ddt->ddt_object[type][class], &dde->dde_key,
-	    dde->dde_phys, DDT_PHYS_SIZE(ddt), tx));
+	    ddt->ddt_object[type][class], &ddlwe->ddlwe_key,
+	    &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx));
 }

 static int
@@ -701,16 +743,15 @@ ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
 }

 uint64_t
-ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
+ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp)
 {
 	uint64_t refcnt = 0;

-	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
-		refcnt = dde->dde_phys->ddp_flat.ddp_refcnt;
-	} else {
-		for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
-			refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt;
-	}
+	if (ddt->ddt_flags & DDT_FLAG_FLAT)
+		refcnt = ddp->ddp_flat.ddp_refcnt;
+	else
+		for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++)
+			refcnt += ddp->ddp_trad[v].ddp_refcnt;

 	return (refcnt);
 }
@@ -743,11 +784,15 @@ ddt_init(void)
 	    DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
 	ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
 	    DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	ddt_log_init();
 }

 void
 ddt_fini(void)
 {
+	ddt_log_fini();
+
 	kmem_cache_destroy(ddt_entry_trad_cache);
 	kmem_cache_destroy(ddt_entry_flat_cache);
 	kmem_cache_destroy(ddt_cache);
@@ -805,6 +850,13 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 {
 	ASSERT(MUTEX_HELD(&ddt->ddt_lock));

+	/* Entry is still in the log, so charge the entry back to it */
+	if (dde->dde_flags & DDE_FLAG_LOGGED) {
+		ddt_lightweight_entry_t ddlwe;
+		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
+	}
+
 	avl_remove(&ddt->ddt_tree, dde);
 	ddt_free(ddt, dde);
 }
@@ -951,6 +1003,25 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)

 	avl_insert(&ddt->ddt_tree, dde, where);

+	/* If its in the log tree, we can "load" it from there */
+	if (ddt->ddt_flags & DDT_FLAG_LOG) {
+		ddt_lightweight_entry_t ddlwe;
+
+		if (ddt_log_take_key(ddt, ddt->ddt_log_active,
+		    &search, &ddlwe) ||
+		    ddt_log_take_key(ddt, ddt->ddt_log_flushing,
+		    &search, &ddlwe)) {
+			dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
+
+			dde->dde_type = ddlwe.ddlwe_type;
+			dde->dde_class = ddlwe.ddlwe_class;
+			memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
+			    DDT_PHYS_SIZE(ddt));
+
+			return (dde);
+		}
+	}
+
 	/*
 	 * ddt_tree is now stable, so unlock and let everyone else keep moving.
 	 * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
@@ -993,10 +1064,14 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
 		dde->dde_flags |= DDE_FLAG_OVERQUOTA;
 	} else if (error == 0) {
 		/*
-		 * The histograms only track inactive (stored) blocks.
+		 * The histograms only track inactive (stored or logged) blocks.
 		 * We've just put an entry onto the live list, so we need to
 		 * remove its counts. When its synced back, it'll be re-added
 		 * to the right one.
+		 *
+		 * We only do this when we successfully found it in the store.
+		 * error == ENOENT means this is a new entry, and so its already
+		 * not counted.
 		 */
 		ddt_histogram_t *ddh =
 		    &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
@@ -1099,6 +1174,8 @@ ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
 		}
 	}

+	ddt_log_destroy(ddt, tx);
+
 	uint64_t count;
 	ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
 	ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
@@ -1241,23 +1318,26 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)

 	ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
 	memset(ddt, 0, sizeof (ddt_t));
-
 	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&ddt->ddt_tree, ddt_key_compare,
 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
 	avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
 	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+
 	ddt->ddt_checksum = c;
 	ddt->ddt_spa = spa;
 	ddt->ddt_os = spa->spa_meta_objset;
 	ddt->ddt_version = DDT_VERSION_UNCONFIGURED;

+	ddt_log_alloc(ddt);
+
 	return (ddt);
 }

 static void
 ddt_table_free(ddt_t *ddt)
 {
+	ddt_log_free(ddt);
 	ASSERT0(avl_numnodes(&ddt->ddt_tree));
 	ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
 	avl_destroy(&ddt->ddt_tree);
@@ -1310,6 +1390,10 @@ ddt_load(spa_t *spa)
 			}
 		}

+		error = ddt_log_load(ddt);
+		if (error != 0 && error != ENOENT)
+			return (error);
+
 		/*
 		 * Seed the cached histograms.
 		 */
@@ -1483,109 +1567,15 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio)
 }

 static void
-ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx)
 {
-	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
-	ddt_key_t *ddk = &dde->dde_key;
-	ddt_type_t otype = dde->dde_type;
-	ddt_type_t ntype = DDT_TYPE_DEFAULT;
-	ddt_class_t oclass = dde->dde_class;
-	ddt_class_t nclass;
-	uint64_t total_refcnt = 0;
-
-	ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
-
-	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
-		ASSERT(dde->dde_io == NULL ||
-		    dde->dde_io->dde_lead_zio[p] == NULL);
-		ddt_univ_phys_t *ddp = dde->dde_phys;
-		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
-		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
-
-		if (ddt_phys_birth(ddp, v) == 0) {
-			ASSERT0(phys_refcnt);
-			continue;
-		}
-		if (DDT_PHYS_IS_DITTO(ddt, p)) {
-			/*
-			 * Note, we no longer create DDT-DITTO blocks, but we
-			 * don't want to leak any written by older software.
-			 */
-			ddt_phys_free(ddt, ddk, ddp, v, txg);
-			continue;
-		}
-		if (phys_refcnt == 0)
-			ddt_phys_free(ddt, ddk, ddp, v, txg);
-		total_refcnt += phys_refcnt;
-	}
-
-	if (total_refcnt > 1)
-		nclass = DDT_CLASS_DUPLICATE;
-	else
-		nclass = DDT_CLASS_UNIQUE;
-
-	if (otype != DDT_TYPES &&
-	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
-		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
-		ASSERT3U(
-		    ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT);
-	}
-
-	if (total_refcnt != 0) {
-		dde->dde_type = ntype;
-		dde->dde_class = nclass;
-
-		if (!ddt_object_exists(ddt, ntype, nclass))
-			ddt_object_create(ddt, ntype, nclass, tx);
-		VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx));
-
-		ddt_lightweight_entry_t ddlwe;
-		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
-
-		ddt_histogram_t *ddh =
-		    &ddt->ddt_histogram[ntype][nclass];
-		ddt_histogram_add_entry(ddt, ddh, &ddlwe);
-
-		/*
-		 * If the class changes, the order that we scan this bp
-		 * changes.  If it decreases, we could miss it, so
-		 * scan it right now.  (This covers both class changing
-		 * while we are doing ddt_walk(), and when we are
-		 * traversing.)
-		 */
-		if (nclass < oclass) {
-			dsl_scan_ddt_entry(dp->dp_scan,
-			    ddt->ddt_checksum, ddt, &ddlwe, tx);
-		}
-	}
-}
-
-static void
-ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
-{
-	spa_t *spa = ddt->ddt_spa;
-	ddt_entry_t *dde;
-	void *cookie = NULL;
-
-	if (avl_numnodes(&ddt->ddt_tree) == 0)
-		return;
-
-	ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP);
-
-	if (spa->spa_ddt_stat_object == 0) {
-		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
-		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_DDT_STATS, tx);
-	}
-
-	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
-		ddt_create_dir(ddt, tx);
-
-	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
-		ddt_sync_entry(ddt, dde, tx, txg);
-		ddt_free(ddt, dde);
-	}
-
+	/*
+	 * Count all the entries stored for each type/class, and updates the
+	 * stats within (ddt_object_sync()). If there's no entries for the
+	 * type/class, the whole object is removed. If all objects for the DDT
+	 * are removed, its containing dir is removed, effectively resetting
+	 * the entire DDT to an empty slate.
+	 */
 	uint64_t count = 0;
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		uint64_t add, tcount = 0;
@@ -1604,6 +1594,12 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 		count += tcount;
 	}

+	if (ddt->ddt_flags & DDT_FLAG_LOG) {
+		/* Include logged entries in the total count */
+		count += avl_numnodes(&ddt->ddt_log_active->ddl_tree);
+		count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
+	}
+
 	if (count == 0) {
 		/*
 		 * No entries left on the DDT, so reset the version for next
@@ -1620,8 +1616,398 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)

 	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
 	    sizeof (ddt->ddt_histogram));
-	spa->spa_dedup_dspace = ~0ULL;
-	spa->spa_dedup_dsize = ~0ULL;
+	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
+	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
+}
+
+static void
+ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+
+	/*
+	 * Compute the target class, so we can decide whether or not to inform
+	 * the scrub traversal (below). Note that we don't store this in the
+	 * entry, as it might change multiple times before finally being
+	 * committed (if we're logging). Instead, we recompute it in
+	 * ddt_sync_entry().
+	 */
+	uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys);
+	ddt_class_t nclass =
+	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
+
+	/*
+	 * If the class changes, the order that we scan this bp changes. If it
+	 * decreases, we could miss it, so scan it right now. (This covers both
+	 * class changing while we are doing ddt_walk(), and when we are
+	 * traversing.)
+	 *
+	 * We also do this when the refcnt goes to zero, because that change is
+	 * only in the log so far; the blocks on disk won't be freed until
+	 * the log is flushed, and the refcnt might increase before that. If it
+	 * does, then we could miss it in the same way.
+	 */
+	if (refcnt == 0 || nclass < ddlwe->ddlwe_class)
+		dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt,
+		    ddlwe, tx);
+}
+
+static void
+ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
+    ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx)
+{
+	ddt_key_t *ddk = &ddlwe->ddlwe_key;
+	ddt_type_t ntype = DDT_TYPE_DEFAULT;
+	uint64_t refcnt = 0;
+
+	/*
+	 * Compute the total refcnt. Along the way, issue frees for any DVAs
+	 * we no longer want.
+	 */
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
+		ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
+		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
+		uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
+
+		if (ddt_phys_birth(ddp, v) == 0) {
+			ASSERT3U(phys_refcnt, ==, 0);
+			continue;
+		}
+		if (DDT_PHYS_IS_DITTO(ddt, p)) {
+			/*
+			 * We don't want to keep any obsolete slots (eg ditto),
+			 * regardless of their refcount, but we don't want to
+			 * leak them either. So, free them.
+			 */
+			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
+			continue;
+		}
+		if (phys_refcnt == 0)
+			/* No remaining references, free it! */
+			ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg);
+		refcnt += phys_refcnt;
+	}
+
+	/* Select the best class for the entry. */
+	ddt_class_t nclass =
+	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
+
+	/*
+	 * If an existing entry changed type or class, or its refcount reached
+	 * zero, delete it from the DDT object
+	 */
+	if (otype != DDT_TYPES &&
+	    (otype != ntype || oclass != nclass || refcnt == 0)) {
+		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
+		ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT);
+	}
+
+	/*
+	 * Add or update the entry
+	 */
+	if (refcnt != 0) {
+		ddt_histogram_t *ddh =
+		    &ddt->ddt_histogram[ntype][nclass];
+
+		ddt_histogram_add_entry(ddt, ddh, ddlwe);
+
+		if (!ddt_object_exists(ddt, ntype, nclass))
+			ddt_object_create(ddt, ntype, nclass, tx);
+		VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx));
+	}
+}
+
+/* Calculate an exponential weighted moving average, lower limited to zero */
+static inline int32_t
+_ewma(int32_t val, int32_t prev, uint32_t weight)
+{
+	ASSERT3U(val, >=, 0);
+	ASSERT3U(prev, >=, 0);
+	const int32_t new =
+	    MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1));
+	ASSERT3U(new, >=, 0);
+	return (new);
+}
+
+/* Returns true if done for this txg */
+static boolean_t
+ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
+{
+	if (ddt->ddt_flush_pass == 0) {
+		if (spa_sync_pass(ddt->ddt_spa) == 1) {
+			/* First run this txg, get set up */
+			ddt->ddt_flush_start = gethrtime();
+			ddt->ddt_flush_count = 0;
+
+			/*
+			 * How many entries we need to flush. We want to at
+			 * least match the ingest rate.
+			 */
+			ddt->ddt_flush_min = MAX(
+			    ddt->ddt_log_ingest_rate,
+			    zfs_dedup_log_flush_entries_min);
+		} else {
+			/* We already decided we're done for this txg */
+			return (B_FALSE);
+		}
+	} else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) {
+		/*
+		 * We already did some flushing on this pass, skip it. This
+		 * happens when dsl_process_async_destroys() runs during a scan
+		 * (on pass 1) and does an additional ddt_sync() to update
+		 * freed blocks.
+		 */
+		return (B_FALSE);
+	}
+
+	if (spa_sync_pass(ddt->ddt_spa) >
+	    MAX(zfs_dedup_log_flush_passes_max, 1)) {
+		/* Too many passes this txg, defer until next. */
+		ddt->ddt_flush_pass = 0;
+		return (B_TRUE);
+	}
+
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
+		/* Nothing to flush, done for this txg. */
+		ddt->ddt_flush_pass = 0;
+		return (B_TRUE);
+	}
+
+	uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ?
+	    MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
+	    SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout);
+
+	uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start;
+
+	if (elapsed_time >= target_time) {
+		/* Too long since we started, done for this txg. */
+		ddt->ddt_flush_pass = 0;
+		return (B_TRUE);
+	}
+
+	ddt->ddt_flush_pass++;
+	ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass);
+
+	/*
+	 * Estimate how much time we'll need to flush the remaining entries
+	 * based on how long it normally takes.
+	 */
+	uint32_t want_time;
+	if (ddt->ddt_flush_pass == 1) {
+		/* First pass, use the average time/entries */
+		if (ddt->ddt_log_flush_rate == 0)
+			/* Zero rate, just assume the whole time */
+			want_time = target_time;
+		else
+			want_time = ddt->ddt_flush_min *
+			    ddt->ddt_log_flush_time_rate /
+			    ddt->ddt_log_flush_rate;
+	} else {
+		/* Later pass, calculate from this txg so far */
+		want_time = ddt->ddt_flush_min *
+		    elapsed_time / ddt->ddt_flush_count;
+	}
+
+	/* Figure out how much time we have left */
+	uint32_t remain_time = target_time - elapsed_time;
+
+	/* Smear the remaining entries over the remaining passes. */
+	uint32_t nentries = ddt->ddt_flush_min /
+	    (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass);
+	if (want_time > remain_time) {
+		/*
+		 * We're behind; try to catch up a bit by doubling the amount
+		 * this pass. If we're behind that means we're in a later
+		 * pass and likely have most of the remaining time to
+		 * ourselves. If we're in the last couple of passes, then
+		 * doubling might just take us over the timeout, but probably
+		 * not be much, and it stops us falling behind. If we're
+		 * in the middle passes, there'll be more to do, but it
+		 * might just help us catch up a bit and we'll recalculate on
+		 * the next pass anyway.
+		 */
+		nentries = MIN(ddt->ddt_flush_min, nentries*2);
+	}
+
+	ddt_lightweight_entry_t ddlwe;
+	uint32_t count = 0;
+	while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
+		ddt_sync_flush_entry(ddt, &ddlwe,
+		    ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
+
+		/* End this pass if we've synced as much as we need to. */
+		if (++count >= nentries)
+			break;
+	}
+	ddt->ddt_flush_count += count;
+	ddt->ddt_flush_min -= count;
+
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
+		/* We emptied it, so truncate on-disk */
+		ddt_log_truncate(ddt, tx);
+		/* No more passes needed this txg */
+		ddt->ddt_flush_pass = 0;
+	} else
+		/* More to do next time, save checkpoint */
+		ddt_log_checkpoint(ddt, &ddlwe, tx);
+
+	ddt_sync_update_stats(ddt, tx);
+
+	return (ddt->ddt_flush_pass == 0);
+}
+
+static void
+ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ASSERT(avl_is_empty(&ddt->ddt_tree));
+
+	/* Don't do any flushing when the pool is ready to shut down */
+	if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa))
+		return;
+
+	/* Try to flush some. */
+	if (!ddt_sync_flush_log_incremental(ddt, tx))
+		/* More to do next time */
+		return;
+
+	/* No more flushing this txg, so we can do end-of-txg housekeeping */
+
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
+	    !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
+		/*
+		 * No more to flush, and the active list has stuff, so
+		 * try to swap the logs for next time.
+		 */
+		(void) ddt_log_swap(ddt, tx);
+	}
+
+	/*
+	 * Update flush rate. This is an exponential weighted moving average of
+	 * the number of entries flushed over recent txgs.
+	 */
+	ddt->ddt_log_flush_rate = _ewma(
+	    ddt->ddt_flush_count, ddt->ddt_log_flush_rate,
+	    zfs_dedup_log_flush_flow_rate_txgs);
+
+	/*
+	 * Update flush time rate. This is an exponential weighted moving
+	 * average of the total time taken to flush over recent txgs.
+	 */
+	ddt->ddt_log_flush_time_rate = _ewma(
+	    ddt->ddt_log_flush_time_rate,
+	    ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))),
+	    zfs_dedup_log_flush_flow_rate_txgs);
+}
+
+static void
+ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
+{
+	uint64_t count = avl_numnodes(&ddt->ddt_tree);
+
+	if (count > 0) {
+		ddt_log_update_t dlu = {0};
+		ddt_log_begin(ddt, count, tx, &dlu);
+
+		ddt_entry_t *dde;
+		void *cookie = NULL;
+		ddt_lightweight_entry_t ddlwe;
+		while ((dde =
+		    avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
+			DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+			ddt_log_entry(ddt, &ddlwe, &dlu);
+			ddt_sync_scan_entry(ddt, &ddlwe, tx);
+			ddt_free(ddt, dde);
+		}
+
+		ddt_log_commit(ddt, &dlu);
+
+		/*
+		 * Sync the stats for the store objects. Even though we haven't
+		 * modified anything on those objects, they're no longer the
+		 * source of truth for entries that are now in the log, and we
+		 * need the on-disk counts to reflect that, otherwise we'll
+		 * miscount later when importing.
+		 */
+		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+			for (ddt_class_t class = 0;
+			    class < DDT_CLASSES; class++) {
+				if (ddt_object_exists(ddt, type, class))
+					ddt_object_sync(ddt, type, class, tx);
+			}
+		}
+
+		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
+		    sizeof (ddt->ddt_histogram));
+		ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
+		ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
+	}
+
+	if (spa_sync_pass(ddt->ddt_spa) == 1)
+		/*
+		 * Update ingest rate. This is an exponential weighted moving
+		 * average of the number of entries changed over recent txgs.
+		 * The ramp-up cost shouldn't matter too much because the
+		 * flusher will be trying to take at least the minimum anyway.
+		 */
+		ddt->ddt_log_ingest_rate = _ewma(
+		    count, ddt->ddt_log_ingest_rate,
+		    zfs_dedup_log_flush_flow_rate_txgs);
+}
+
+static void
+ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
+{
+	if (avl_numnodes(&ddt->ddt_tree) == 0)
+		return;
+
+	ddt_entry_t *dde;
+	void *cookie = NULL;
+	while ((dde = avl_destroy_nodes(
+	    &ddt->ddt_tree, &cookie)) != NULL) {
+		ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
+
+		ddt_lightweight_entry_t ddlwe;
+		DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
+		ddt_sync_flush_entry(ddt, &ddlwe,
+		    dde->dde_type, dde->dde_class, tx);
+		ddt_sync_scan_entry(ddt, &ddlwe, tx);
+		ddt_free(ddt, dde);
+	}
+
+	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
+	    sizeof (ddt->ddt_histogram));
+	ddt->ddt_spa->spa_dedup_dspace = ~0ULL;
+	ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
+	ddt_sync_update_stats(ddt, tx);
+}
+
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx)
+{
+	spa_t *spa = ddt->ddt_spa;
+
+	if (ddt->ddt_version == UINT64_MAX)
+		return;
+
+	if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) {
+		ASSERT0(avl_numnodes(&ddt->ddt_tree));
+		return;
+	}
+
+	if (spa->spa_ddt_stat_object == 0) {
+		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
+		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_DDT_STATS, tx);
+	}
+
+	if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
+		ddt_create_dir(ddt, tx);
+
+	if (ddt->ddt_flags & DDT_FLAG_LOG)
+		ddt_sync_table_log(ddt, tx);
+	else
+		ddt_sync_table_flush(ddt, tx);
 }

 void
@@ -1651,7 +2037,9 @@ ddt_sync(spa_t *spa, uint64_t txg)
 		ddt_t *ddt = spa->spa_ddt[c];
 		if (ddt == NULL)
 			continue;
-		ddt_sync_table(ddt, tx, txg);
+		ddt_sync_table(ddt, tx);
+		if (ddt->ddt_flags & DDT_FLAG_LOG)
+			ddt_sync_flush_log(ddt, tx);
 		ddt_repair_table(ddt, rio);
 	}

@@ -1719,9 +2107,12 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 		return (B_FALSE);
 	}

-	if (dde->dde_type < DDT_TYPES) {
-		ASSERT3S(dde->dde_class, <, DDT_CLASSES);
-
+	if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) {
+		/*
+		 * This entry was either synced to a store object (dde_type is
+		 * real) or was logged. It must be properly on disk at this
+		 * point, so we can just bump its refcount.
+		 */
 		int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

@@ -1748,7 +2139,6 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 		 * we may have a block with the DEDUP set, but which doesn't
 		 * have a corresponding entry in the DDT. Be ready.
 		 */
-		ASSERT3S(dde->dde_class, ==, DDT_CLASSES);
 		ddt_remove(ddt, dde);
 		result = B_FALSE;
 	}
@@ -1761,3 +2151,15 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)

 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
 	"Enable prefetching dedup-ed blks");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW,
+	"Max number of incremental dedup log flush passes per transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
+	"Min time to spend on incremental dedup log flush each transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
+	"Min number of log entries to flush each transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
+	"Number of txgs to average flow rates across");
@@ -0,0 +1,760 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2023, Klara Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/ddt.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu.h>
+#include <sys/ddt_impl.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * No more than this many txgs before swapping logs.
+ */
+uint_t zfs_dedup_log_txg_max = 8;
+
+/*
+ * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
+ * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
+ */
+uint64_t zfs_dedup_log_mem_max = 0;
+uint_t zfs_dedup_log_mem_max_percent = 1;
+
+
+static kmem_cache_t *ddt_log_entry_flat_cache;
+static kmem_cache_t *ddt_log_entry_trad_cache;
+
+#define	DDT_LOG_ENTRY_FLAT_SIZE	\
+	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
+#define	DDT_LOG_ENTRY_TRAD_SIZE	\
+	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)
+
+#define	DDT_LOG_ENTRY_SIZE(ddt)	\
+	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
+
+void
+ddt_log_init(void)
+{
+	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
+	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
+	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	/*
+	 * Max memory for log AVL entries. At least 1M, because we need
+	 * something (that's ~3800 entries per tree). They can say 100% if they
+	 * want; it just means they're at the mercy of the the txg flush limit.
+	 */
+	if (zfs_dedup_log_mem_max == 0) {
+		zfs_dedup_log_mem_max_percent =
+		    MIN(zfs_dedup_log_mem_max_percent, 100);
+		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
+		    zfs_dedup_log_mem_max_percent / 100;
+	}
+	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
+}
+
+void
+ddt_log_fini(void)
+{
+	kmem_cache_destroy(ddt_log_entry_trad_cache);
+	kmem_cache_destroy(ddt_log_entry_flat_cache);
+}
+
+static void
+ddt_log_name(ddt_t *ddt, char *name, uint_t n)
+{
+	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
+	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
+}
+
+static void
+ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
+	dmu_buf_will_dirty(db, tx);
+
+	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
+	DLH_SET_VERSION(hdr, 1);
+	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
+	hdr->dlh_length = ddl->ddl_length;
+	hdr->dlh_first_txg = ddl->ddl_first_txg;
+	hdr->dlh_checkpoint = ddl->ddl_checkpoint;
+
+	dmu_buf_rele(db, FTAG);
+}
+
+static void
+ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
+{
+	ASSERT3U(ddt->ddt_dir_object, >, 0);
+	ASSERT3U(ddl->ddl_object, ==, 0);
+
+	char name[DDT_NAMELEN];
+	ddt_log_name(ddt, name, n);
+
+	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
+	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
+	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
+	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
+	ddl->ddl_length = 0;
+	ddl->ddl_first_txg = tx->tx_txg;
+	ddt_log_update_header(ddt, ddl, tx);
+}
+
+static void
+ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
+	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
+}
+
+static void
+ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
+{
+	ASSERT3U(ddt->ddt_dir_object, >, 0);
+
+	if (ddl->ddl_object == 0)
+		return;
+
+	ASSERT0(ddl->ddl_length);
+
+	char name[DDT_NAMELEN];
+	ddt_log_name(ddt, name, n);
+
+	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
+	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));
+
+	ddl->ddl_object = 0;
+}
+
+void
+ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
+	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
+}
+
+static void
+ddt_log_update_stats(ddt_t *ddt)
+{
+	/*
+	 * Log object stats. We count the number of live entries in the log
+	 * tree, even if there are more than on disk, and even if the same
+	 * entry is on both append and flush trees, because that's more what
+	 * the user expects to see. This does mean the on-disk size is not
+	 * really correlated with the number of entries, but I don't think
+	 * that's reasonable to expect anyway.
+	 */
+	dmu_object_info_t doi;
+	uint64_t nblocks;
+	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
+	nblocks = doi.doi_physical_blocks_512;
+	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
+	nblocks += doi.doi_physical_blocks_512;
+
+	ddt_object_t *ddo = &ddt->ddt_log_stats;
+	ddo->ddo_count =
+	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
+	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
+	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
+	ddo->ddo_dspace = nblocks << 9;
+}
+
+void
+ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
+{
+	ASSERT3U(nentries, >, 0);
+	ASSERT3P(dlu->dlu_dbp, ==, NULL);
+
+	if (ddt->ddt_log_active->ddl_object == 0)
+		ddt_log_create(ddt, tx);
+
+	/*
+	 * We want to store as many entries as we can in a block, but never
+	 * split an entry across block boundaries.
+	 */
+	size_t reclen = P2ALIGN_TYPED(
+	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
+	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
+	ASSERT3U(reclen, <=, UINT16_MAX);
+	dlu->dlu_reclen = reclen;
+
+	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
+	    &dlu->dlu_dn));
+	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);
+
+	uint64_t nblocks = howmany(nentries,
+	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
+	uint64_t offset = ddt->ddt_log_active->ddl_length;
+	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;
+
+	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
+	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
+	    DMU_READ_NO_PREFETCH));
+
+	dlu->dlu_tx = tx;
+	dlu->dlu_block = dlu->dlu_offset = 0;
+}
+
+static ddt_log_entry_t *
+ddt_log_alloc_entry(ddt_t *ddt)
+{
+	ddt_log_entry_t *ddle;
+
+	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
+		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
+		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
+	} else {
+		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
+		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
+	}
+
+	return (ddle);
+}
+
+static void
+ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
+{
+	/* Create the log tree entry from a live or stored entry */
+	avl_index_t where;
+	ddt_log_entry_t *ddle =
+	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
+	if (ddle == NULL) {
+		ddle = ddt_log_alloc_entry(ddt);
+		ddle->ddle_key = ddlwe->ddlwe_key;
+		avl_insert(&ddl->ddl_tree, ddle, where);
+	}
+	ddle->ddle_type = ddlwe->ddlwe_type;
+	ddle->ddle_class = ddlwe->ddlwe_class;
+	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
+}
+
+void
+ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
+{
+	ASSERT3U(dlu->dlu_dbp, !=, NULL);
+
+	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
+	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+
+	/* Get our block */
+	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
+	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];
+
+	/*
+	 * If this would take us past the end of the block, finish it and
+	 * move to the next one.
+	 */
+	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
+		ASSERT3U(dlu->dlu_offset, >, 0);
+		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
+		dlu->dlu_block++;
+		dlu->dlu_offset = 0;
+		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
+		db = dlu->dlu_dbp[dlu->dlu_block];
+	}
+
+	/*
+	 * If this is the first time touching the block, inform the DMU that
+	 * we will fill it, and zero it out.
+	 */
+	if (dlu->dlu_offset == 0) {
+		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
+		memset(db->db_data, 0, db->db_size);
+	}
+
+	/* Create the log record directly in the buffer */
+	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
+	DLR_SET_TYPE(dlr, DLR_ENTRY);
+	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
+	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
+	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);
+
+	ddt_log_record_entry_t *dlre =
+	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
+	dlre->dlre_key = ddlwe->ddlwe_key;
+	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
+
+	/* Advance offset for next record. */
+	dlu->dlu_offset += dlu->dlu_reclen;
+}
+
+void
+ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
+{
+	ASSERT3U(dlu->dlu_dbp, !=, NULL);
+	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
+	ASSERT3U(dlu->dlu_offset, >, 0);
+
+	/*
+	 * Close out the last block. Whatever we haven't used will be zeroed,
+	 * which matches DLR_INVALID, so we can detect this during load.
+	 */
+	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);
+
+	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);
+
+	ddt->ddt_log_active->ddl_length +=
+	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
+	dnode_rele(dlu->dlu_dn, FTAG);
+
+	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);
+
+	memset(dlu, 0, sizeof (ddt_log_update_t));
+
+	ddt_log_update_stats(ddt);
+}
+
+boolean_t
+ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
+{
+	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
+	if (ddle == NULL)
+		return (B_FALSE);
+
+	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
+
+	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+
+	avl_remove(&ddl->ddl_tree, ddle);
+	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+
+	return (B_TRUE);
+}
+
+boolean_t
+ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
+    ddt_lightweight_entry_t *ddlwe)
+{
+	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
+	if (ddle == NULL)
+		return (B_FALSE);
+
+	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
+
+	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
+
+	avl_remove(&ddl->ddl_tree, ddle);
+	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+
+	return (B_TRUE);
+}
+
+void
+ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
+{
+	ddt_log_t *ddl = ddt->ddt_log_flushing;
+
+	ASSERT3U(ddl->ddl_object, !=, 0);
+
+#ifdef ZFS_DEBUG
+	/*
+	 * There should not be any entries on the log tree before the given
+	 * checkpoint. Assert that this is the case.
+	 */
+	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
+	if (ddle != NULL)
+		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
+		    >, 0);
+#endif
+
+	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
+	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
+	ddt_log_update_header(ddt, ddl, tx);
+
+	ddt_log_update_stats(ddt);
+}
+
+void
+ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
+{
+	ddt_log_t *ddl = ddt->ddt_log_flushing;
+
+	if (ddl->ddl_object == 0)
+		return;
+
+	ASSERT(avl_is_empty(&ddl->ddl_tree));
+
+	/* Eject the entire object */
+	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);
+
+	ddl->ddl_length = 0;
+	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
+	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
+	ddt_log_update_header(ddt, ddl, tx);
+
+	ddt_log_update_stats(ddt);
+}
+
+boolean_t
+ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
+{
+	/* Swap the logs. The old flushing one must be empty */
+	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));
+
+	/*
+	 * If there are still blocks on the flushing log, truncate it first.
+	 * This can happen if there were entries on the flushing log that were
+	 * removed in memory via ddt_lookup(); their vestigal remains are
+	 * on disk.
+	 */
+	if (ddt->ddt_log_flushing->ddl_length > 0)
+		ddt_log_truncate(ddt, tx);
+
+	/*
+	 * Swap policy. We swap the logs (and so begin flushing) when the
+	 * active tree grows too large, or when we haven't swapped it in
+	 * some amount of time.
+	 */
+
+	/*
+	 * The log tree is too large if the memory usage of its entries is over
+	 * half of the memory limit. This effectively gives each log tree half
+	 * the available memory.
+	 */
+	const boolean_t too_large =
+	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
+	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);
+
+	const boolean_t too_old =
+	    tx->tx_txg >=
+	    (ddt->ddt_log_active->ddl_first_txg +
+	    MAX(1, zfs_dedup_log_txg_max));
+
+	if (!(too_large || too_old))
+		return (B_FALSE);
+
+	ddt_log_t *swap = ddt->ddt_log_active;
+	ddt->ddt_log_active = ddt->ddt_log_flushing;
+	ddt->ddt_log_flushing = swap;
+
+	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
+	ddt->ddt_log_active->ddl_flags &=
+	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);
+
+	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
+	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
+
+	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;
+
+	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
+	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);
+
+	ddt_log_update_stats(ddt);
+
+	return (B_TRUE);
+}
+
+static inline void
+ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
+    const ddt_key_t *checkpoint)
+{
+	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);
+
+	ddt_log_record_entry_t *dlre =
+	    (ddt_log_record_entry_t *)dlr->dlr_payload;
+	if (checkpoint != NULL &&
+	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
+		/* Skip pre-checkpoint entries; they're already flushed. */
+		return;
+	}
+
+	ddt_lightweight_entry_t ddlwe;
+	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
+	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);
+
+	ddlwe.ddlwe_key = dlre->dlre_key;
+	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
+
+	ddt_log_update_entry(ddt, ddl, &ddlwe);
+}
+
+static void
+ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
+{
+	void *cookie = NULL;
+	ddt_log_entry_t *ddle;
+	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
+	while ((ddle =
+	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
+		kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+		    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+	}
+	ASSERT(avl_is_empty(&ddl->ddl_tree));
+}
+
+static int
+ddt_log_load_one(ddt_t *ddt, uint_t n)
+{
+	ASSERT3U(n, <, 2);
+
+	ddt_log_t *ddl = &ddt->ddt_log[n];
+
+	char name[DDT_NAMELEN];
+	ddt_log_name(ddt, name, n);
+
+	uint64_t obj;
+	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
+	    sizeof (uint64_t), 1, &obj);
+	if (err == ENOENT)
+		return (0);
+	if (err != 0)
+		return (err);
+
+	dnode_t *dn;
+	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
+	if (err != 0)
+		return (err);
+
+	ddt_log_header_t hdr;
+	dmu_buf_t *db;
+	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0) {
+		dnode_rele(dn, FTAG);
+		return (err);
+	}
+	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
+	dmu_buf_rele(db, FTAG);
+
+	if (DLH_GET_VERSION(&hdr) != 1) {
+		dnode_rele(dn, FTAG);
+		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
+		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
+		    (u_longlong_t)DLH_GET_VERSION(&hdr));
+		return (SET_ERROR(EINVAL));
+	}
+
+	ddt_key_t *checkpoint = NULL;
+	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
+		/*
+		 * If the log has a checkpoint, then we can ignore any entries
+		 * that have already been flushed.
+		 */
+		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
+		checkpoint = &hdr.dlh_checkpoint;
+	}
+
+	if (hdr.dlh_length > 0) {
+		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
+		    ZIO_PRIORITY_SYNC_READ);
+
+		for (uint64_t offset = 0; offset < hdr.dlh_length;
+		    offset += dn->dn_datablksz) {
+			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
+			    DMU_READ_PREFETCH);
+			if (err != 0) {
+				dnode_rele(dn, FTAG);
+				ddt_log_empty(ddt, ddl);
+				return (err);
+			}
+
+			uint64_t boffset = 0;
+			while (boffset < db->db_size) {
+				ddt_log_record_t *dlr =
+				    (ddt_log_record_t *)(db->db_data + boffset);
+
+				/* Partially-filled block, skip the rest */
+				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
+					break;
+
+				switch (DLR_GET_TYPE(dlr)) {
+				case DLR_ENTRY:
+					ddt_log_load_entry(ddt, ddl, dlr,
+					    checkpoint);
+					break;
+
+				default:
+					dmu_buf_rele(db, FTAG);
+					dnode_rele(dn, FTAG);
+					ddt_log_empty(ddt, ddl);
+					return (SET_ERROR(EINVAL));
+				}
+
+				boffset += DLR_GET_RECLEN(dlr);
+			}
+
+			dmu_buf_rele(db, FTAG);
+		}
+	}
+
+	dnode_rele(dn, FTAG);
+
+	ddl->ddl_object = obj;
+	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
+	ddl->ddl_length = hdr.dlh_length;
+	ddl->ddl_first_txg = hdr.dlh_first_txg;
+
+	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
+		ddt->ddt_log_flushing = ddl;
+	else
+		ddt->ddt_log_active = ddl;
+
+	return (0);
+}
+
+int
+ddt_log_load(ddt_t *ddt)
+{
+	int err;
+
+	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
+		/*
+		 * The DDT is going to be freed again in a moment, so there's
+		 * no point loading the log; it'll just slow down import.
+		 */
+		return (0);
+	}
+
+	ASSERT0(ddt->ddt_log[0].ddl_object);
+	ASSERT0(ddt->ddt_log[1].ddl_object);
+	if (ddt->ddt_dir_object == 0) {
+		/*
+		 * If we're configured but the containing dir doesn't exist
+		 * yet, then the log object can't possibly exist either.
+		 */
+		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
+		return (SET_ERROR(ENOENT));
+	}
+
+	if ((err = ddt_log_load_one(ddt, 0)) != 0)
+		return (err);
+	if ((err = ddt_log_load_one(ddt, 1)) != 0)
+		return (err);
+
+	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
+	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
+	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);
+
+	/*
+	 * We have two finalisation tasks:
+	 *
+	 * - rebuild the histogram. We do this at the end rather than while
+	 *   we're loading so we don't need to uncount and recount entries that
+	 *   appear multiple times in the log.
+	 *
+	 * - remove entries from the flushing tree that are on both trees. This
+	 *   happens when ddt_lookup() rehydrates an entry from the flushing
+	 *   tree, as ddt_log_take_key() removes the entry from the in-memory
+	 *   tree but doesn't remove it from disk.
+	 */
+
+	/*
+	 * We don't technically need a config lock here, since there shouldn't
+	 * be pool config changes during DDT load. dva_get_dsize_sync() via
+	 * ddt_stat_generate() is expecting it though, and it won't hurt
+	 * anything, so we take it.
+	 */
+	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);
+
+	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
+	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
+	ddt_log_entry_t *ae = avl_first(al);
+	ddt_log_entry_t *fe = avl_first(fl);
+	while (ae != NULL || fe != NULL) {
+		ddt_log_entry_t *ddle;
+		if (ae == NULL) {
+			/* active exhausted, take flushing */
+			ddle = fe;
+			fe = AVL_NEXT(fl, fe);
+		} else if (fe == NULL) {
+			/* flushing exuhausted, take active */
+			ddle = ae;
+			ae = AVL_NEXT(al, ae);
+		} else {
+			/* compare active and flushing */
+			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
+			if (c < 0) {
+				/* active behind, take and advance */
+				ddle = ae;
+				ae = AVL_NEXT(al, ae);
+			} else if (c > 0) {
+				/* flushing behind, take and advance */
+				ddle = fe;
+				fe = AVL_NEXT(fl, fe);
+			} else {
+				/* match. remove from flushing, take active */
+				ddle = fe;
+				fe = AVL_NEXT(fl, fe);
+				avl_remove(fl, ddle);
+
+				ddle = ae;
+				ae = AVL_NEXT(al, ae);
+			}
+		}
+
+		ddt_lightweight_entry_t ddlwe;
+		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
+		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
+	}
+
+	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);
+
+	ddt_log_update_stats(ddt);
+
+	return (0);
+}
+
+void
+ddt_log_alloc(ddt_t *ddt)
+{
+	ASSERT3P(ddt->ddt_log_active, ==, NULL);
+	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);
+
+	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
+	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
+	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
+	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
+	ddt->ddt_log_active = &ddt->ddt_log[0];
+	ddt->ddt_log_flushing = &ddt->ddt_log[1];
+	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
+}
+
+void
+ddt_log_free(ddt_t *ddt)
+{
+	ddt_log_empty(ddt, &ddt->ddt_log[0]);
+	ddt_log_empty(ddt, &ddt->ddt_log[1]);
+	avl_destroy(&ddt->ddt_log[0].ddl_tree);
+	avl_destroy(&ddt->ddt_log[1].ddl_tree);
+}
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
+	"Max transactions before starting to flush dedup logs");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
+	"Max memory for dedup logs");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
+	"Max memory for dedup logs, as % of total memory");
@@ -42,7 +42,7 @@ ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,

 	memset(dds, 0, sizeof (*dds));

-	for (int p = 0; p < ddlwe->ddlwe_nphys; p++) {
+	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

@@ -222,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 				ddo_total->ddo_mspace += ddo->ddo_mspace;
 			}
 		}
+
+		ddt_object_t *ddo = &ddt->ddt_log_stats;
+		ddo_total->ddo_count += ddo->ddo_count;
+		ddo_total->ddo_dspace += ddo->ddo_dspace;
+		ddo_total->ddo_mspace += ddo->ddo_mspace;
 	}

 	/*
@@ -259,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
 				    &ddt->ddt_histogram_cache[type][class]);
 			}
 		}
+
+		ddt_histogram_add(ddh, &ddt->ddt_log_histogram);
 	}
 }

@@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT		dbuf.cache_shift		dbuf_cache_shift
 DDT_ZAP_DEFAULT_BS		dedup.ddt_zap_default_bs	ddt_zap_default_bs
 DDT_ZAP_DEFAULT_IBS		dedup.ddt_zap_default_ibs	ddt_zap_default_ibs
 DDT_DATA_IS_SPECIAL		ddt_data_is_special		zfs_ddt_data_is_special
+DEDUP_LOG_TXG_MAX		dedup.log_txg_max		zfs_dedup_log_txg_max
 DEADMAN_CHECKTIME_MS		deadman.checktime_ms		zfs_deadman_checktime_ms
 DEADMAN_EVENTS_PER_SECOND	deadman_events_per_second	zfs_deadman_events_per_second
 DEADMAN_FAILMODE		deadman.failmode		zfs_deadman_failmode
@@ -29,9 +29,16 @@

 log_assert "basic dedup (FDT) operations work"

+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	destroy_pool $TESTPOOL
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }

 log_onexit cleanup
@@ -29,9 +29,16 @@

 log_assert "dedup (FDT) retains version after import"

+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	destroy_pool $TESTPOOL
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }

 log_onexit cleanup
@@ -30,9 +30,16 @@

 log_assert "legacy and FDT dedup tables on the same pool can happily coexist"

+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	destroy_pool $TESTPOOL
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }

 log_onexit cleanup
@@ -30,9 +30,16 @@

 log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT"

+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	destroy_pool $TESTPOOL
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }

 log_onexit cleanup
@@ -51,6 +51,12 @@ POOL="dedup_pool"

 save_tunable TXG_TIMEOUT

+# we set the dedup log txg interval to 1, to get a log flush every txg,
+# effectively disabling the log. without this it's hard to predict when and
+# where things appear on-disk
+log_must save_tunable DEDUP_LOG_TXG_MAX
+log_must set_tunable32 DEDUP_LOG_TXG_MAX 1
+
 function cleanup
 {
 	if poolexists $POOL ; then
@@ -58,6 +64,7 @@ function cleanup
 	fi
 	log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR
 	log_must restore_tunable TXG_TIMEOUT
+	log_must restore_tunable DEDUP_LOG_TXG_MAX
 }


@@ -206,10 +213,15 @@ function ddt_dedup_vdev_limit

 	#
 	# With no DDT quota in place, the above workload will produce over
-	# 800,000 entries by using space in the normal class. With a quota,
-	# it will be well below 500,000 entries.
+	# 800,000 entries by using space in the normal class. With a quota, it
+	# should be well under 500,000. However, logged entries are hard to
+	# account for because they can appear on both logs, and can also
+	# represent an eventual removal. This isn't easily visible from
+	# outside, and even internally can result in going slightly over quota.
+	# For here, we just set the entry count a little higher than what we
+	# expect to allow for some instability.
 	#
-	log_must test $(ddt_entries) -le 500000
+	log_must test $(ddt_entries) -le 600000

 	do_clean
 }