diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 66d59ceba..02d0cf5da 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -39,6 +39,12 @@ extern "C" {
struct abd;
+/*
+ * DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
+ */
+/* No flags yet. */
+#define DDT_FLAG_MASK (0)
+
/*
* DDT on-disk storage object types. Each one corresponds to specific
* implementation, see ddt_ops_t. The value itself is not stored on disk.
@@ -185,11 +191,15 @@ typedef struct {
avl_tree_t ddt_tree; /* "live" (changed) entries this txg */
- avl_tree_t ddt_repair_tree; /* entries being repaired */
+ avl_tree_t ddt_repair_tree; /* entries being repaired */
- enum zio_checksum ddt_checksum; /* checksum algorithm in use */
- spa_t *ddt_spa; /* pool this ddt is on */
- objset_t *ddt_os; /* ddt objset (always MOS) */
+ enum zio_checksum ddt_checksum; /* checksum algorithm in use */
+ spa_t *ddt_spa; /* pool this ddt is on */
+ objset_t *ddt_os; /* ddt objset (always MOS) */
+
+ uint64_t ddt_dir_object; /* MOS dir holding ddt objects */
+ uint64_t ddt_version; /* DDT version */
+ uint64_t ddt_flags; /* FDT option flags */
/* per-type/per-class entry store objects */
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h
index 4aaab10c8..9c0fea64f 100644
--- a/include/sys/ddt_impl.h
+++ b/include/sys/ddt_impl.h
@@ -33,6 +33,14 @@
extern "C" {
#endif
+/* DDT version numbers */
+#define DDT_VERSION_LEGACY (0)
+#define DDT_VERSION_FDT (1)
+
+/* Names of interesting objects in the DDT root dir */
+#define DDT_DIR_VERSION "version"
+#define DDT_DIR_FLAGS "flags"
+
/*
* Ops vector to access a specific DDT object type.
*/
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 1376cbef7..5b80dc315 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -376,6 +376,7 @@ typedef struct dmu_buf {
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
#define DMU_POOL_DDT "DDT-%s-%s-%s"
#define DMU_POOL_DDT_STATS "DDT-statistics"
+#define DMU_POOL_DDT_DIR "DDT-%s"
#define DMU_POOL_CREATION_VERSION "creation_version"
#define DMU_POOL_SCAN "scan"
#define DMU_POOL_ERRORSCRUB "error_scrub"
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 2515ba321..5733a8187 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -82,6 +82,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION,
+ SPA_FEATURE_FAST_DEDUP,
SPA_FEATURES
} spa_feature_t;
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 51c8dc964..88baa4168 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -616,7 +616,7 @@
-
+
@@ -6006,7 +6006,8 @@
-
+
+
@@ -9131,8 +9132,8 @@
-
-
+
+
@@ -9209,7 +9210,7 @@
-
+
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index ea3c68dc6..ff6e485a4 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -17,8 +17,9 @@
.\" Copyright (c) 2019, Klara Inc.
.\" Copyright (c) 2019, Allan Jude
.\" Copyright (c) 2021, Colm Buckley
+.\" Copyright (c) 2023, Klara Inc.
.\"
-.Dd June 23, 2022
+.Dd February 14, 2024
.Dt ZPOOL-FEATURES 7
.Os
.
@@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the
.Sy enabled
state when all datasets that use this feature are destroyed.
.
+.feature com.klarasystems fast_dedup yes
+This feature allows more advanced deduplication features to be enabled on new
+dedup tables.
+.Pp
+This feature will be
+.Sy active
+when the first deduplicated block is written after a new dedup table is created
+(i.e. after a new pool is created, or a new checksum is used on a dataset with
+.Sy dedup
+enabled).
+It will be returned to the
+.Sy enabled
+state when all deduplicated blocks using it are freed.
+.
.feature com.delphix extensible_dataset no
This feature allows more flexible use of internal ZFS data structures,
and exists for other features to depend on.
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index 309d9bf14..8dec5f27b 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -754,6 +754,12 @@ zpool_feature_init(void)
"Support for raidz expansion",
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
+ zfeature_register(SPA_FEATURE_FAST_DEDUP,
+ "com.klarasystems:fast_dedup", "fast_dedup",
+ "Support for advanced deduplication",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
+ sfeatures);
+
zfs_mod_list_supported_free(sfeatures);
}
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index d70ae1a03..7e2010c42 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -39,6 +39,7 @@
#include
#include
#include
+#include
/*
* # DDT: Deduplication tables
@@ -185,6 +186,18 @@ static const char *const ddt_class_name[DDT_CLASSES] = {
"unique",
};
+/*
+ * DDT feature flags automatically enabled for each on-disk version. Note that
+ * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled.
+ */
+static const uint64_t ddt_version_flags[] = {
+ [DDT_VERSION_LEGACY] = 0,
+ [DDT_VERSION_FDT] = 0,
+};
+
+/* Dummy version to signal that configure is still necessary */
+#define DDT_VERSION_UNCONFIGURED (UINT64_MAX)
+
static void
ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
dmu_tx_t *tx)
@@ -196,14 +209,18 @@ ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
ZCHECKSUM_FLAG_DEDUP;
char name[DDT_NAMELEN];
+ ASSERT3U(ddt->ddt_dir_object, >, 0);
+
ddt_object_name(ddt, type, class, name);
ASSERT3U(*objectp, ==, 0);
VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
ASSERT3U(*objectp, !=, 0);
- VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
- sizeof (uint64_t), 1, objectp, tx));
+ ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
+
+ VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1,
+ objectp, tx));
VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
@@ -220,13 +237,15 @@ ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
uint64_t count;
char name[DDT_NAMELEN];
+ ASSERT3U(ddt->ddt_dir_object, >, 0);
+
ddt_object_name(ddt, type, class, name);
ASSERT3U(*objectp, !=, 0);
ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
VERIFY0(ddt_object_count(ddt, type, class, &count));
VERIFY0(count);
- VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+ VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx));
VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
@@ -243,9 +262,18 @@ ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
char name[DDT_NAMELEN];
int error;
+ if (ddt->ddt_dir_object == 0) {
+ /*
+ * If we're configured but the containing dir doesn't exist
+ * yet, then this object can't possibly exist either.
+ */
+ ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
+ return (SET_ERROR(ENOENT));
+ }
+
ddt_object_name(ddt, type, class, name);
- error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
if (error != 0)
return (error);
@@ -684,6 +712,8 @@ ddt_prefetch_all(spa_t *spa)
}
}
+static int ddt_configure(ddt_t *ddt, boolean_t new);
+
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{
@@ -697,6 +727,15 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+ if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) {
+ /*
+ * First use of this DDT since pool creation/load, or
+ * since it was emptied; finish getting it ready for use.
+ */
+ VERIFY0(ddt_configure(ddt, B_TRUE));
+ ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
+ }
+
ddt_key_fill(&search, bp);
/* Find an existing live entry */
@@ -837,6 +876,181 @@ ddt_key_compare(const void *x1, const void *x2)
return (TREE_ISIGN(cmp));
}
+/* Create this DDT's dir, record version and flags in it, bump feature count */
+static void
+ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx)
+{
+ ASSERT3U(ddt->ddt_dir_object, ==, 0);
+ ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
+ /* Dir name is DMU_POOL_DDT_DIR filled in with the checksum name */
+ char name[DDT_NAMELEN];
+ snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
+ zio_checksum_table[ddt->ddt_checksum].ci_name);
+ /* Create the dir ZAP and link it into the pool directory by name */
+ ddt->ddt_dir_object = zap_create_link(ddt->ddt_os,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx);
+ /* Persist version and flags so ddt_configure() can read them back */
+ VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION,
+ sizeof (uint64_t), 1, &ddt->ddt_version, tx));
+ VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS,
+ sizeof (uint64_t), 1, &ddt->ddt_flags, tx));
+
+ spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
+}
+
+/* Destroy the (now empty) containing dir and drop the feature count */
+static void
+ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx)
+{
+ ASSERT3U(ddt->ddt_dir_object, !=, 0);
+ ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT);
+ ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT);
+
+ char name[DDT_NAMELEN];
+ snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
+ zio_checksum_table[ddt->ddt_checksum].ci_name);
+ /* Every per-type/per-class object must already be gone */
+ for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+ for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
+ ASSERT(!ddt_object_exists(ddt, type, class));
+ }
+ }
+ /* Only the version and flags entries may remain in the dir */
+ uint64_t count;
+ ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count));
+ ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object,
+ DDT_DIR_VERSION));
+ ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS));
+ ASSERT3U(count, ==, 2);
+ /* Drop the name from the pool directory, then destroy the dir ZAP */
+ VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+ VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx));
+ /* 0 == no dir; ddt_create_dir() asserts this before making one */
+ ddt->ddt_dir_object = 0;
+
+ spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx);
+}
+
+/*
+ * Determine the version, flags and on-disk layout from what's already stored.
+ * If nothing is stored: returns ENOENT when new is false, otherwise selects
+ * based on pool config. An unknown stored version or flags returns EINVAL.
+ */
+static int
+ddt_configure(ddt_t *ddt, boolean_t new)
+{
+ spa_t *spa = ddt->ddt_spa;
+ char name[DDT_NAMELEN];
+ int error;
+
+ ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE);
+
+ boolean_t fdt_enabled =
+ spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP);
+ boolean_t fdt_active =
+ spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP);
+
+ /*
+ * First, look for the global DDT stats object. If it's not there, then
+ * there's never been a DDT written before ever, and we know we're
+ * starting from scratch.
+ */
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object);
+ if (error != 0) {
+ if (error != ENOENT)
+ return (error);
+ goto not_found;
+ }
+
+ if (fdt_active) {
+ /*
+ * Now look for a DDT directory. If it exists, then it has
+ * everything we need.
+ */
+ snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR,
+ zio_checksum_table[ddt->ddt_checksum].ci_name);
+
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
+ &ddt->ddt_dir_object);
+ if (error == 0) {
+ ASSERT3P(spa->spa_meta_objset, ==, ddt->ddt_os);
+
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
+ DDT_DIR_VERSION, sizeof (uint64_t), 1,
+ &ddt->ddt_version);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
+ DDT_DIR_FLAGS, sizeof (uint64_t), 1,
+ &ddt->ddt_flags);
+ if (error != 0)
+ return (error);
+
+ if (ddt->ddt_version != DDT_VERSION_FDT) {
+ zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
+ "unknown version %llu", spa_name(spa),
+ name, (u_longlong_t)ddt->ddt_version);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) {
+ zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s "
+ "version=%llu unknown flags %llx",
+ spa_name(spa), name,
+ (u_longlong_t)ddt->ddt_version,
+ (u_longlong_t)ddt->ddt_flags);
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (0);
+ }
+ if (error != ENOENT)
+ return (error);
+ }
+
+ /* Any object in the root indicates a traditional setup. */
+ for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+ for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
+ ddt_object_name(ddt, type, class, name);
+ uint64_t obj;
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t),
+ 1, &obj);
+ if (error == ENOENT)
+ continue;
+ if (error != 0)
+ return (error);
+
+ ddt->ddt_version = DDT_VERSION_LEGACY;
+ ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
+ ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
+
+ return (0);
+ }
+ }
+
+not_found:
+ if (!new)
+ return (SET_ERROR(ENOENT));
+
+ /* Nothing on disk, so set up for the best version we can */
+ if (fdt_enabled) {
+ ddt->ddt_version = DDT_VERSION_FDT;
+ ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
+ ddt->ddt_dir_object = 0; /* create on first use */
+ } else {
+ ddt->ddt_version = DDT_VERSION_LEGACY;
+ ddt->ddt_flags = ddt_version_flags[ddt->ddt_version];
+ ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT;
+ }
+
+ return (0);
+}
+
static ddt_t *
ddt_table_alloc(spa_t *spa, enum zio_checksum c)
{
@@ -853,6 +1067,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
ddt->ddt_checksum = c;
ddt->ddt_spa = spa;
ddt->ddt_os = spa->spa_meta_objset;
+ ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
return (ddt);
}
@@ -889,7 +1104,6 @@ ddt_load(spa_t *spa)
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
&spa->spa_ddt_stat_object);
-
if (error)
return (error == ENOENT ? 0 : error);
@@ -898,6 +1112,12 @@ ddt_load(spa_t *spa)
continue;
ddt_t *ddt = spa->spa_ddt[c];
+ error = ddt_configure(ddt, B_FALSE);
+ if (error == ENOENT)
+ continue;
+ if (error != 0)
+ return (error);
+
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
class++) {
@@ -912,10 +1132,11 @@ ddt_load(spa_t *spa)
*/
memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
sizeof (ddt->ddt_histogram));
- spa->spa_dedup_dspace = ~0ULL;
- spa->spa_dedup_dsize = ~0ULL;
}
+ spa->spa_dedup_dspace = ~0ULL;
+ spa->spa_dedup_dsize = ~0ULL;
+
return (0);
}
@@ -1147,25 +1368,44 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
DMU_POOL_DDT_STATS, tx);
}
+ if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0)
+ ddt_create_dir(ddt, tx);
+
while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
ddt_sync_entry(ddt, dde, tx, txg);
ddt_free(dde);
}
+ uint64_t count = 0;
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
- uint64_t add, count = 0;
+ uint64_t add, tcount = 0;
for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
if (ddt_object_exists(ddt, type, class)) {
ddt_object_sync(ddt, type, class, tx);
VERIFY0(ddt_object_count(ddt, type, class,
&add));
- count += add;
+ tcount += add;
}
}
for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
- if (count == 0 && ddt_object_exists(ddt, type, class))
+ if (tcount == 0 && ddt_object_exists(ddt, type, class))
ddt_object_destroy(ddt, type, class, tx);
}
+ count += tcount;
+ }
+
+ if (count == 0) {
+ /*
+ * No entries left on the DDT, so reset the version for next
+ * time. This allows us to handle the feature being changed
+ * since the DDT was originally created. New entries should get
+ * whatever the feature currently demands.
+ */
+ if (ddt->ddt_version == DDT_VERSION_FDT)
+ ddt_destroy_dir(ddt, tx);
+
+ ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
+ ddt->ddt_flags = 0;
}
memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c
index e12d5498c..c3bceabab 100644
--- a/module/zfs/zio_compress.c
+++ b/module/zfs/zio_compress.c
@@ -48,6 +48,10 @@ static unsigned long zio_decompress_fail_fraction = 0;
/*
* Compression vectors.
+ *
+ * NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS.
+ * THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE
+ * PART OF THE ON-DISK FORMAT.
*/
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{"inherit", 0, NULL, NULL, NULL},
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index e8a94ce20..50c1b7a9d 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -109,5 +109,6 @@ if is_linux || is_freebsd; then
"feature@block_cloning"
"feature@vdev_zaps_v2"
"feature@raidz_expansion"
+ "feature@fast_dedup"
)
fi