diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 66d59ceba..02d0cf5da 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -39,6 +39,12 @@ extern "C" { struct abd; +/* + * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). + */ +/* No flags yet. */ +#define DDT_FLAG_MASK (0) + /* * DDT on-disk storage object types. Each one corresponds to specific * implementation, see ddt_ops_t. The value itself is not stored on disk. @@ -185,11 +191,15 @@ typedef struct { avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ - avl_tree_t ddt_repair_tree; /* entries being repaired */ + avl_tree_t ddt_repair_tree; /* entries being repaired */ - enum zio_checksum ddt_checksum; /* checksum algorithm in use */ - spa_t *ddt_spa; /* pool this ddt is on */ - objset_t *ddt_os; /* ddt objset (always MOS) */ + enum zio_checksum ddt_checksum; /* checksum algorithm in use */ + spa_t *ddt_spa; /* pool this ddt is on */ + objset_t *ddt_os; /* ddt objset (always MOS) */ + + uint64_t ddt_dir_object; /* MOS dir holding ddt objects */ + uint64_t ddt_version; /* DDT version */ + uint64_t ddt_flags; /* FDT option flags */ /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 4aaab10c8..9c0fea64f 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -33,6 +33,14 @@ extern "C" { #endif +/* DDT version numbers */ +#define DDT_VERSION_LEGACY (0) +#define DDT_VERSION_FDT (1) + +/* Names of interesting objects in the DDT root dir */ +#define DDT_DIR_VERSION "version" +#define DDT_DIR_FLAGS "flags" + /* * Ops vector to access a specific DDT object type. 
*/ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1376cbef7..5b80dc315 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -376,6 +376,7 @@ typedef struct dmu_buf { #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" +#define DMU_POOL_DDT_DIR "DDT-%s" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_ERRORSCRUB "error_scrub" diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 2515ba321..5733a8187 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -82,6 +82,7 @@ typedef enum spa_feature { SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, + SPA_FEATURE_FAST_DEDUP, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 51c8dc964..88baa4168 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -616,7 +616,7 @@ - + @@ -6006,7 +6006,8 @@ - + + @@ -9131,8 +9132,8 @@ - - + + @@ -9209,7 +9210,7 @@ - + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index ea3c68dc6..ff6e485a4 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -17,8 +17,9 @@ .\" Copyright (c) 2019, Klara Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley +.\" Copyright (c) 2023, Klara Inc. .\" -.Dd June 23, 2022 +.Dd February 14, 2024 .Dt ZPOOL-FEATURES 7 .Os . @@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . +.feature com.klarasystems fast_dedup yes +This feature allows more advanced deduplication features to be enabled on new +dedup tables. +.Pp +This feature will be +.Sy active +when the first deduplicated block is written after a new dedup table is created +(i.e. after a new pool is created, or a new checksum is used on a dataset with +.Sy dedup +enabled). 
+It will be returned to the +.Sy enabled +state when all deduplicated blocks using it are freed. +. .feature com.delphix extensible_dataset no This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on. diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 309d9bf14..8dec5f27b 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -754,6 +754,12 @@ zpool_feature_init(void) "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfeature_register(SPA_FEATURE_FAST_DEDUP, + "com.klarasystems:fast_dedup", "fast_dedup", + "Support for advanced deduplication", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index d70ae1a03..7e2010c42 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -39,6 +39,7 @@ #include #include #include +#include /* * # DDT: Deduplication tables @@ -185,6 +186,18 @@ static const char *const ddt_class_name[DDT_CLASSES] = { "unique", }; +/* + * DDT feature flags automatically enabled for each on-disk version. Note that + * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled. 
+ */ +static const uint64_t ddt_version_flags[] = { + [DDT_VERSION_LEGACY] = 0, + [DDT_VERSION_FDT] = 0, +}; + +/* Dummy version to signal that configure is still necessary */ +#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) + static void ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) @@ -196,14 +209,18 @@ ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ZCHECKSUM_FLAG_DEDUP; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, ==, 0); VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); ASSERT3U(*objectp, !=, 0); - VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + + VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, + objectp, tx)); VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), @@ -220,13 +237,15 @@ ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t count; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); VERIFY0(ddt_object_count(ddt, type, class, &count)); VERIFY0(count); - VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx)); VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); @@ -243,9 +262,18 @@ ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) char name[DDT_NAMELEN]; int error; + if (ddt->ddt_dir_object == 0) { + /* + * If we're configured but the containing dir doesn't exist + * yet, then this object can't possibly exist either. 
+ */ + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + return (SET_ERROR(ENOENT)); + } + ddt_object_name(ddt, type, class, name); - error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); if (error != 0) return (error); @@ -684,6 +712,8 @@ ddt_prefetch_all(spa_t *spa) } } +static int ddt_configure(ddt_t *ddt, boolean_t new); + ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { @@ -697,6 +727,15 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) { + /* + * This is the first use of this DDT since the pool was + * created; finish getting it ready for use. + */ + VERIFY0(ddt_configure(ddt, B_TRUE)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + } + ddt_key_fill(&search, bp); /* Find an existing live entry */ @@ -837,6 +876,181 @@ ddt_key_compare(const void *x1, const void *x2) return (TREE_ISIGN(cmp)); } +/* Create the containing dir for this DDT and bump the feature count */ +static void +ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, ==, 0); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + ddt->ddt_dir_object = zap_create_link(ddt->ddt_os, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx); + + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, + sizeof (uint64_t), 1, &ddt->ddt_version, tx)); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, + sizeof (uint64_t), 1, &ddt->ddt_flags, tx)); + + spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* Destroy the containing dir and deactivate the feature */ +static void +ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + 
ASSERT3U(ddt->ddt_dir_object, !=, 0); + ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ASSERT(!ddt_object_exists(ddt, type, class)); + } + } + + uint64_t count; + ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS)); + ASSERT3U(count, ==, 2); + + VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx)); + + ddt->ddt_dir_object = 0; + + spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* + * Determine the version, flags and on-disk layout from what's already stored. + * If there's nothing stored, then if new is false, returns ENOENT, and if + * true, selects based on pool config. + */ +static int +ddt_configure(ddt_t *ddt, boolean_t new) +{ + spa_t *spa = ddt->ddt_spa; + char name[DDT_NAMELEN]; + int error; + + ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE); + + boolean_t fdt_enabled = + spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP); + boolean_t fdt_active = + spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP); + + /* + * First, look for the global DDT stats object. If it's not there, then + * there's never been a DDT written before ever, and we know we're + * starting from scratch. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + if (error != 0) { + if (error != ENOENT) + return (error); + goto not_found; + } + + if (fdt_active) { + /* + * Now look for a DDT directory. 
If it exists, then it has + * everything we need. + */ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, + &ddt->ddt_dir_object); + if (error == 0) { + ASSERT3P(spa->spa_meta_objset, ==, ddt->ddt_os); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION, sizeof (uint64_t), 1, + &ddt->ddt_version); + if (error != 0) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_FLAGS, sizeof (uint64_t), 1, + &ddt->ddt_flags); + if (error != 0) + return (error); + + if (ddt->ddt_version != DDT_VERSION_FDT) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "unknown version %llu", spa_name(spa), + name, (u_longlong_t)ddt->ddt_version); + return (SET_ERROR(EINVAL)); + } + + if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "version=%llu unknown flags %llx", + spa_name(spa), name, + (u_longlong_t)ddt->ddt_version, + (u_longlong_t)ddt->ddt_flags); + return (SET_ERROR(EINVAL)); + } + + return (0); + } + if (error != ENOENT) + return (error); + } + + /* Any object in the root indicates a traditional setup. 
*/ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ddt_object_name(ddt, type, class, name); + uint64_t obj; + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), + 1, &obj); + if (error == ENOENT) + continue; + if (error != 0) + return (error); + + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + + return (0); + } + } + +not_found: + if (!new) + return (SET_ERROR(ENOENT)); + + /* Nothing on disk, so set up for the best version we can */ + if (fdt_enabled) { + ddt->ddt_version = DDT_VERSION_FDT; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = 0; /* create on first use */ + } else { + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + } + + return (0); +} + static ddt_t * ddt_table_alloc(spa_t *spa, enum zio_checksum c) { @@ -853,6 +1067,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; return (ddt); } @@ -889,7 +1104,6 @@ ddt_load(spa_t *spa) error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); - if (error) return (error == ENOENT ? 
0 : error); @@ -898,6 +1112,12 @@ ddt_load(spa_t *spa) continue; ddt_t *ddt = spa->spa_ddt[c]; + error = ddt_configure(ddt, B_FALSE); + if (error == ENOENT) + continue; + if (error != 0) + return (error); + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { @@ -912,10 +1132,11 @@ ddt_load(spa_t *spa) */ memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); - spa->spa_dedup_dspace = ~0ULL; - spa->spa_dedup_dsize = ~0ULL; } + spa->spa_dedup_dspace = ~0ULL; + spa->spa_dedup_dsize = ~0ULL; + return (0); } @@ -1147,25 +1368,44 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) DMU_POOL_DDT_STATS, tx); } + if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) + ddt_create_dir(ddt, tx); + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ddt_sync_entry(ddt, dde, tx, txg); ddt_free(dde); } + uint64_t count = 0; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - uint64_t add, count = 0; + uint64_t add, tcount = 0; for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); VERIFY0(ddt_object_count(ddt, type, class, &add)); - count += add; + tcount += add; } } for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (count == 0 && ddt_object_exists(ddt, type, class)) + if (tcount == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } + count += tcount; + } + + if (count == 0) { + /* + * No entries left on the DDT, so reset the version for next + * time. This allows us to handle the feature being changed + * since the DDT was originally created. New entries should get + * whatever the feature currently demands. 
+ */ + if (ddt->ddt_version == DDT_VERSION_FDT) + ddt_destroy_dir(ddt, tx); + + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_flags = 0; } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index e12d5498c..c3bceabab 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -48,6 +48,10 @@ static unsigned long zio_decompress_fail_fraction = 0; /* * Compression vectors. + * + * NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS. + * THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE + * PART OF THE ON-DISK FORMAT. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"inherit", 0, NULL, NULL, NULL}, diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index e8a94ce20..50c1b7a9d 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -109,5 +109,6 @@ if is_linux || is_freebsd; then "feature@block_cloning" "feature@vdev_zaps_v2" "feature@raidz_expansion" + "feature@fast_dedup" ) fi