Add DDT prune command

Requires the new 'flat' physical data which has the start
time for a class entry.

The amount to prune can be based on a target percentage of
the unique entries or based on the age (i.e., every entry
older than N days).

Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #16277
This commit is contained in:
Don Brady 2024-06-17 22:35:18 +00:00 committed by Brian Behlendorf
parent 4a4f7b019f
commit d4d79451cb
21 changed files with 905 additions and 85 deletions

View File

@ -2045,7 +2045,7 @@ dump_all_ddts(spa_t *spa)
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt)
if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
continue;
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
@ -2072,6 +2072,32 @@ dump_all_ddts(spa_t *spa)
}
dump_dedup_ratio(&dds_total);
/*
* Dump a histogram of unique class entry age
*/
if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) {
ddt_age_histo_t histogram;
(void) printf("DDT walk unique, building age histogram...\n");
ddt_prune_walk(spa, 0, &histogram);
/*
* print out histogram for unique entry class birth
*/
if (histogram.dah_entries > 0) {
(void) printf("%5s %9s %4s\n",
"age", "blocks", "amnt");
(void) printf("%5s %9s %4s\n",
"-----", "---------", "----");
for (int i = 0; i < HIST_BINS; i++) {
(void) printf("%5d %9d %4d%%\n", 1 << i,
(int)histogram.dah_age_histo[i],
(int)((histogram.dah_age_histo[i] * 100) /
histogram.dah_entries));
}
}
}
}
static void
@ -5749,12 +5775,17 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
ddt_entry_t *dde = ddt_lookup(ddt, bp);
/*
* ddt_lookup() can only return NULL if this block didn't exist
* ddt_lookup() can return NULL if this block didn't exist
* in the DDT and creating it would take the DDT over its
* quota. Since we got the block from disk, it must exist in
* the DDT, so this can't happen.
* the DDT, so this can't happen. However, when unique entries
* are pruned, the dedup bit can be set with no corresponding
* entry in the DDT.
*/
VERIFY3P(dde, !=, NULL);
if (dde == NULL) {
ddt_exit(ddt);
goto skipped;
}
/* Get the phys for this variant */
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
@ -5774,8 +5805,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
(void *)(((uintptr_t)dde->dde_io) | (1 << v));
/* Consume a reference for this block. */
VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0);
ddt_phys_decref(dde->dde_phys, v);
if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
ddt_phys_decref(dde->dde_phys, v);
/*
* If this entry has a single flat phys, it may have been
@ -5864,6 +5895,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
}
}
skipped:
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
int t = (i & 1) ? type : ZDB_OT_TOTAL;
@ -8138,7 +8170,7 @@ dump_mos_leaks(spa_t *spa)
for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt)
if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
continue;
/* DDT store objects */
@ -8150,11 +8182,14 @@ dump_mos_leaks(spa_t *spa)
}
/* FDT container */
mos_obj_refd(ddt->ddt_dir_object);
if (ddt->ddt_version == DDT_VERSION_FDT)
mos_obj_refd(ddt->ddt_dir_object);
/* FDT log objects */
mos_obj_refd(ddt->ddt_log[0].ddl_object);
mos_obj_refd(ddt->ddt_log[1].ddl_object);
if (ddt->ddt_flags & DDT_FLAG_LOG) {
mos_obj_refd(ddt->ddt_log[0].ddl_object);
mos_obj_refd(ddt->ddt_log[1].ddl_object);
}
}
if (spa->spa_brt != NULL) {

View File

@ -130,6 +130,8 @@ static int zpool_do_version(int, char **);
static int zpool_do_wait(int, char **);
static int zpool_do_ddt_prune(int, char **);
static int zpool_do_help(int argc, char **argv);
static zpool_compat_status_t zpool_do_load_compat(
@ -170,6 +172,7 @@ typedef enum {
HELP_CLEAR,
HELP_CREATE,
HELP_CHECKPOINT,
HELP_DDT_PRUNE,
HELP_DESTROY,
HELP_DETACH,
HELP_EXPORT,
@ -426,6 +429,8 @@ static zpool_command_t command_table[] = {
{ "sync", zpool_do_sync, HELP_SYNC },
{ NULL },
{ "wait", zpool_do_wait, HELP_WAIT },
{ NULL },
{ "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE },
};
#define NCOMMAND (ARRAY_SIZE(command_table))
@ -545,6 +550,8 @@ get_usage(zpool_help_t idx)
case HELP_WAIT:
return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
"<pool> [interval]\n"));
case HELP_DDT_PRUNE:
return (gettext("\tddtprune -d|-p <amount> <pool>\n"));
default:
__builtin_unreachable();
}
@ -13342,6 +13349,88 @@ found:;
return (error);
}
/*
 * zpool ddtprune -d|-p <amount> <pool>
 *
 *       -d <days>	Prune entries <days> old and older
 *       -p <percent>	Prune <percent> amount of entries
 *
 * Prune single reference entries from DDT to satisfy the amount specified.
 *
 * Exactly one of -d or -p must be supplied, followed by a single pool name.
 * A -d value is converted from days to seconds before it is handed to
 * zpool_ddt_prune(); a day count whose conversion does not fit in 64 bits
 * is rejected instead of being allowed to wrap to a much smaller age.
 *
 * Returns the result of zpool_ddt_prune(), or -1 if the pool cannot be
 * opened.
 */
int
zpool_do_ddt_prune(int argc, char **argv)
{
	const uint64_t secs_per_day = 86400;
	zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE;
	uint64_t amount = 0;
	zpool_handle_t *zhp;
	char *endptr;
	int c;

	while ((c = getopt(argc, argv, "d:p:")) != -1) {
		switch (c) {
		case 'd':
			if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
				(void) fprintf(stderr, gettext("-d cannot be "
				    "combined with -p option\n"));
				usage(B_FALSE);
			}
			errno = 0;
			amount = strtoull(optarg, &endptr, 0);
			/*
			 * The last test detects unsigned wrap-around of the
			 * days-to-seconds conversion: if the product
			 * overflowed, dividing it back does not recover the
			 * original day count.
			 */
			if (errno != 0 || *endptr != '\0' || amount == 0 ||
			    (amount * secs_per_day) / secs_per_day != amount) {
				(void) fprintf(stderr,
				    gettext("invalid days value\n"));
				usage(B_FALSE);
			}
			amount *= secs_per_day;	/* convert days to seconds */
			unit = ZPOOL_DDT_PRUNE_AGE;
			break;
		case 'p':
			if (unit == ZPOOL_DDT_PRUNE_AGE) {
				(void) fprintf(stderr, gettext("-p cannot be "
				    "combined with -d option\n"));
				usage(B_FALSE);
			}
			errno = 0;
			amount = strtoull(optarg, &endptr, 0);
			if (errno != 0 || *endptr != '\0' ||
			    amount == 0 || amount > 100) {
				(void) fprintf(stderr,
				    gettext("invalid percentage value\n"));
				usage(B_FALSE);
			}
			unit = ZPOOL_DDT_PRUNE_PERCENTAGE;
			break;
		case '?':
			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
			    optopt);
			usage(B_FALSE);
		}
	}
	argc -= optind;
	argv += optind;

	/* An amount option and exactly one pool operand are required. */
	if (unit == ZPOOL_DDT_PRUNE_NONE) {
		(void) fprintf(stderr,
		    gettext("missing amount option (-d|-p <value>)\n"));
		usage(B_FALSE);
	} else if (argc < 1) {
		(void) fprintf(stderr, gettext("missing pool argument\n"));
		usage(B_FALSE);
	} else if (argc > 1) {
		(void) fprintf(stderr, gettext("too many arguments\n"));
		usage(B_FALSE);
	}

	zhp = zpool_open(g_zfs, argv[0]);
	if (zhp == NULL)
		return (-1);

	int error = zpool_ddt_prune(zhp, unit, amount);

	zpool_close(zhp);

	return (error);
}
static int
find_command_idx(const char *command, int *idx)
{

View File

@ -276,6 +276,8 @@ extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
extern uint64_t raidz_expand_max_reflow_bytes;
extern uint_t raidz_expand_pause_point;
extern boolean_t ddt_prune_artificial_age;
extern boolean_t ddt_dump_prune_histogram;
static ztest_shared_opts_t *ztest_shared_opts;
@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
ztest_func_t ztest_pool_prefetch_ddt;
ztest_func_t ztest_ddt_prune;
static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@ -7288,6 +7292,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id)
mutex_exit(&ztest_vdev_lock);
}
/*
 * ztest operation: ask the pool under test to prune a randomly chosen
 * percentage of its unique DDT entries.  Neither argument is used.
 */
void
ztest_ddt_prune(ztest_ds_t *zd, uint64_t id)
{
	(void) zd;
	(void) id;

	/*
	 * One more than ztest_random(15); presumably a value in the
	 * range 1-15 (confirm against ztest_random()).
	 */
	const uint64_t percent = 1 + ztest_random(15);

	/* The result is deliberately ignored. */
	(void) ddt_prune_unique_entries(ztest_spa, ZPOOL_DDT_PRUNE_PERCENTAGE,
	    percent);
}
/*
* Verify pool integrity by running zdb.
*/
@ -7469,6 +7484,13 @@ ztest_resume_thread(void *arg)
{
spa_t *spa = arg;
/*
* Synthesize aged DDT entries for ddt prune testing
*/
ddt_prune_artificial_age = B_TRUE;
if (ztest_opts.zo_verbose >= 3)
ddt_dump_prune_histogram = B_TRUE;
while (!ztest_exiting) {
if (spa_suspended(spa))
ztest_resume(spa);
@ -8587,6 +8609,12 @@ ztest_init(ztest_shared_t *zs)
if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
continue;
/*
* split 50/50 between legacy and fast dedup
*/
if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0)
continue;
VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
spa_feature_table[i].fi_uname));
fnvlist_add_uint64(props, buf, 0);

View File

@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8
usr/share/man/man8/zpool-create.8
usr/share/man/man8/zpool-destroy.8
usr/share/man/man8/zpool-detach.8
usr/share/man/man8/zpool-ddtprune.8
usr/share/man/man8/zpool-events.8
usr/share/man/man8/zpool-export.8
usr/share/man/man8/zpool-get.8

View File

@ -305,6 +305,9 @@ _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
_LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);
_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t,
uint64_t);
_LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int,
vdev_state_t *);
_LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);

View File

@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);
_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);
_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t,
uint64_t);
#ifdef __cplusplus
}
#endif

View File

@ -405,6 +405,9 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
uint64_t amount);
#ifdef __cplusplus
}
#endif

View File

@ -35,8 +35,11 @@ extern "C" {
#endif
/* DDT version numbers */
#define DDT_VERSION_LEGACY (0)
#define DDT_VERSION_FDT (1)
#define DDT_VERSION_LEGACY (0)
#define DDT_VERSION_FDT (1)
/* Dummy version to signal that configure is still necessary */
#define DDT_VERSION_UNCONFIGURED (UINT64_MAX)
/* Names of interesting objects in the DDT root dir */
#define DDT_DIR_VERSION "version"
@ -187,8 +190,11 @@ extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);
extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl,
const ddt_key_t *ddk);
extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
dmu_tx_t *tx);
@ -211,6 +217,44 @@ extern void ddt_log_fini(void);
* them up.
*/
/*
* We use a histogram to convert a percentage request into a
* cutoff value where entries older than the cutoff get pruned.
*
* The histogram bins represent hours in power-of-two increments.
* 16 bins covers up to four years.
*/
#define HIST_BINS 16
/*
 * Age histogram handed to ddt_prune_walk() and printed by
 * ddt_dump_age_histogram().
 */
typedef struct ddt_age_histo {
/* total entry count; the divisor used for each bin's percentage */
uint64_t dah_entries;
/* per-bin entry counts; bin i is printed with an age label of (1 << i) */
uint64_t dah_age_histo[HIST_BINS];
} ddt_age_histo_t;
void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram);
#if defined(_KERNEL) || !defined(ZFS_DEBUG)
#define ddt_dump_age_histogram(histo, cutoff) ((void)0)
#else
/*
 * Print the age histogram to stdout as a three column table (age label,
 * entry count, share of all entries in percent), preceded by a heading
 * that shows the cutoff as a number of hours relative to the current time.
 * Nothing is printed when the histogram holds no entries.
 */
static inline void
ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff)
{
	const uint64_t total = histogram->dah_entries;

	if (total != 0) {
		const uint64_t *bins = histogram->dah_age_histo;
		u_longlong_t cutoff_hours =
		    (u_longlong_t)(gethrestime_sec() - cutoff) / 3600;

		(void) printf("DDT prune unique class age, %llu hour cutoff\n",
		    cutoff_hours);
		(void) printf("%5s %9s %4s\n", "age", "blocks", "amnt");
		(void) printf("%5s %9s %4s\n", "-----", "---------", "----");

		int bin = 0;
		while (bin < HIST_BINS) {
			/* share of this bin among all entries, in percent */
			int share = (int)((bins[bin] * 100) / total);

			(void) printf("%5d %9llu %4d%%\n", 1 << bin,
			    (u_longlong_t)bins[bin], share);
			bin++;
		}
	}
}
#endif
/*
* Enough room to expand DMU_POOL_DDT format for all possible DDT
* checksum/class/type combinations.

View File

@ -1422,7 +1422,7 @@ typedef enum {
*/
typedef enum zfs_ioc {
/*
* Core features - 88/128 numbers reserved.
* Core features - 89/128 numbers reserved.
*/
#ifdef __FreeBSD__
ZFS_IOC_FIRST = 0,
@ -1519,6 +1519,7 @@ typedef enum zfs_ioc {
ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */
ZFS_IOC_POOL_SCRUB, /* 0x5a57 */
ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */
ZFS_IOC_DDT_PRUNE, /* 0x5a59 */
/*
* Per-platform (Optional) - 8/128 numbers reserved.
@ -1655,6 +1656,12 @@ typedef enum {
ZPOOL_PREFETCH_DDT
} zpool_prefetch_type_t;
/*
 * How the amount passed with a DDT prune request is to be interpreted.
 */
typedef enum {
ZPOOL_DDT_PRUNE_NONE,	/* no unit selected */
ZPOOL_DDT_PRUNE_AGE, /* in seconds */
ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */
} zpool_ddt_prune_unit_t;
/*
* Bookmark name values.
*/
@ -1753,6 +1760,12 @@ typedef enum {
*/
#define ZPOOL_PREFETCH_TYPE "prefetch_type"
/*
* The following are names used when invoking ZFS_IOC_DDT_PRUNE.
*/
#define DDT_PRUNE_UNIT "ddt_prune_unit"
#define DDT_PRUNE_AMOUNT "ddt_prune_amount"
/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/

View File

@ -412,6 +412,7 @@ struct spa {
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
boolean_t spa_active_ddt_prune; /* ddt prune process active */
struct brt *spa_brt; /* in-core BRT */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */

View File

@ -183,8 +183,8 @@
<elf-symbol name='fsleep' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_dataset_depth' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_system_hostid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='get_timestamp' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getexecname' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getextmntent' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getmntany' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='getprop_uint64' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -466,7 +466,9 @@
<elf-symbol name='zpool_clear' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_clear_label' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_close' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_default_search_paths' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_disable_datasets' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -485,8 +487,8 @@
<elf-symbol name='zpool_export_force' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_feature_init' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_config' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_parent_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_find_vdev_by_physpath' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_free_handles' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_get_all_vdev_props' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -529,7 +531,6 @@
<elf-symbol name='zpool_prefetch' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prepare_and_label_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prepare_disk' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_align_right' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_column_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_prop_default_numeric' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -5929,6 +5930,7 @@
<enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
<enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
<enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
<enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
<enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
<enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
<enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@ -5963,6 +5965,13 @@
<enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
</enum-decl>
<typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
<enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
<enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
<enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
</enum-decl>
<typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
<enum-decl name='spa_feature' id='33ecb627'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='SPA_FEATURE_NONE' value='-1'/>
@ -6139,6 +6148,12 @@
<parameter type-id='857bb57e'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='02e25ab0'/>
<parameter type-id='9c313c2d'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zfs_resolve_shortname' mangled-name='zfs_resolve_shortname' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_resolve_shortname'>
<parameter type-id='80f4b756'/>
<parameter type-id='26a90f95'/>
@ -6798,6 +6813,12 @@
<parameter type-id='80f4b756' name='propval'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_ddt_prune' mangled-name='zpool_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_ddt_prune'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='02e25ab0' name='unit'/>
<parameter type-id='9c313c2d' name='amount'/>
<return type-id='95e97e5e'/>
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='lib/libzfs/libzfs_sendrecv.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='8901473c' size-in-bits='576' id='f5da478b'>
@ -7837,7 +7858,7 @@
</data-member>
</class-decl>
<typedef-decl name='vdev_cbdata_t' type-id='b8006be8' id='a9679c94'/>
<class-decl name='zprop_get_cbdata' size-in-bits='832' is-struct='yes' visibility='default' id='f3d3c319'>
<class-decl name='zprop_get_cbdata' size-in-bits='960' is-struct='yes' visibility='default' id='f3d3c319'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='cb_sources' type-id='95e97e5e' visibility='default'/>
</data-member>
@ -7856,6 +7877,9 @@
<data-member access='public' layout-offset-in-bits='448'>
<var-decl name='cb_first' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='480'>
<var-decl name='cb_json' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='512'>
<var-decl name='cb_proplist' type-id='3a9b2288' visibility='default'/>
</data-member>
@ -7865,6 +7889,15 @@
<data-member access='public' layout-offset-in-bits='640'>
<var-decl name='cb_vdevs' type-id='a9679c94' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='832'>
<var-decl name='cb_jsobj' type-id='5ce45b60' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='896'>
<var-decl name='cb_json_as_int' type-id='c19b74c3' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='928'>
<var-decl name='cb_json_pool_key_guid' type-id='c19b74c3' visibility='default'/>
</data-member>
</class-decl>
<typedef-decl name='zprop_get_cbdata_t' type-id='f3d3c319' id='f3d87113'/>
<typedef-decl name='zprop_func' type-id='2e711a2a' id='1ec3747a'/>
@ -7968,6 +8001,11 @@
<qualified-type-def type-id='d33f11cb' restrict='yes' id='5c53ba29'/>
<pointer-type-def type-id='ffa52b96' size-in-bits='64' id='76c8174b'/>
<pointer-type-def type-id='f3d87113' size-in-bits='64' id='0d2a0670'/>
<function-decl name='nvlist_print_json' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='822cd80b'/>
<parameter type-id='5ce45b60'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_label_disk' mangled-name='zpool_label_disk' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_label_disk'>
<parameter type-id='b0382bb3'/>
<parameter type-id='4c81de99'/>
@ -8075,6 +8113,11 @@
<parameter type-id='d33f11cb'/>
<return type-id='48b5725f'/>
</function-decl>
<function-decl name='putc' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='95e97e5e'/>
<parameter type-id='822cd80b'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='puts' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<return type-id='95e97e5e'/>
@ -8093,6 +8136,11 @@
<parameter type-id='95e97e5e'/>
<return type-id='48b5725f'/>
</function-decl>
<function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='80f4b756'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='strnlen' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='b59d7dce'/>
@ -8292,12 +8340,12 @@
<function-decl name='zfs_version_print' mangled-name='zfs_version_print' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_print'>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zfs_version_nvlist' mangled-name='zfs_version_nvlist' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_version_nvlist'>
<return type-id='5ce45b60'/>
</function-decl>
<function-decl name='use_color' mangled-name='use_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='use_color'>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='printf_color' mangled-name='printf_color' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='printf_color'>
<parameter type-id='80f4b756' name='color'/>
<parameter type-id='80f4b756' name='format'/>
@ -8802,11 +8850,6 @@
<parameter type-id='78c01427'/>
<return type-id='13956559'/>
</function-decl>
<function-decl name='strspn' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='80f4b756'/>
<return type-id='b59d7dce'/>
</function-decl>
<function-decl name='zfs_dirnamelen' mangled-name='zfs_dirnamelen' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_dirnamelen'>
<parameter type-id='80f4b756' name='path'/>
<return type-id='79a0948f'/>

View File

@ -5649,3 +5649,31 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
return (ret);
}
/*
 * Prune older entries from the DDT to reclaim space under the quota
 *
 * 'unit' and 'amount' are passed through unchanged to lzc_ddt_prune().
 *
 * Returns 0 on success.  On failure the error is reported through the
 * pool's libzfs handle and -1 is returned; EALREADY is reported as
 * EZFS_BUSY with a note that a prune is already in progress.
 */
int
zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit,
    uint64_t amount)
{
	int error = lzc_ddt_prune(zhp->zpool_name, unit, amount);

	if (error != 0) {
		libzfs_handle_t *hdl = zhp->zpool_hdl;
		char errbuf[ERRBUFLEN];

		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
		    "cannot prune dedup table on '%s'"), zhp->zpool_name);

		if (error == EALREADY) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "a prune operation is already in progress"));
			(void) zfs_error(hdl, EZFS_BUSY, errbuf);
		} else {
			/*
			 * Report the code returned by lzc_ddt_prune() rather
			 * than errno, which the calls above may have changed.
			 */
			(void) zpool_standard_error(hdl, error, errbuf);
		}
		return (-1);
	}

	return (0);
}

View File

@ -162,6 +162,7 @@
<elf-symbol name='lzc_channel_program_nosync' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_clone' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy_bookmarks' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='lzc_destroy_snaps' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -1444,6 +1445,7 @@
<enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
<enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
<enumerator name='ZFS_IOC_POOL_PREFETCH' value='23128'/>
<enumerator name='ZFS_IOC_DDT_PRUNE' value='23129'/>
<enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
<enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
<enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
@ -1484,6 +1486,13 @@
<enumerator name='ZPOOL_PREFETCH_DDT' value='1'/>
</enum-decl>
<typedef-decl name='zpool_prefetch_type_t' type-id='0299ab50' id='e55ff6bc'/>
<enum-decl name='zpool_ddt_prune_unit_t' naming-typedef-id='02e25ab0' id='509ae11c'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='ZPOOL_DDT_PRUNE_NONE' value='0'/>
<enumerator name='ZPOOL_DDT_PRUNE_AGE' value='1'/>
<enumerator name='ZPOOL_DDT_PRUNE_PERCENTAGE' value='2'/>
</enum-decl>
<typedef-decl name='zpool_ddt_prune_unit_t' type-id='509ae11c' id='02e25ab0'/>
<enum-decl name='data_type_t' naming-typedef-id='8d0687d2' id='aeeae136'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='DATA_TYPE_DONTCARE' value='-1'/>
@ -3015,6 +3024,12 @@
<parameter type-id='857bb57e' name='outnvl'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='lzc_ddt_prune' mangled-name='lzc_ddt_prune' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='lzc_ddt_prune'>
<parameter type-id='80f4b756' name='pool'/>
<parameter type-id='02e25ab0' name='unit'/>
<parameter type-id='9c313c2d' name='amount'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-type size-in-bits='64' id='c70fa2e8'>
<parameter type-id='95e97e5e'/>
<parameter type-id='eaa32e2f'/>

View File

@ -1927,3 +1927,25 @@ lzc_get_bootenv(const char *pool, nvlist_t **outnvl)
{
return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl));
}
/*
 * Prune the specified amount from the pool's dedup table.
 *
 * The unit and amount are packed into an nvlist under DDT_PRUNE_UNIT and
 * DDT_PRUNE_AMOUNT and submitted with ZFS_IOC_DDT_PRUNE.  Any nvlist handed
 * back by the ioctl is discarded; only the ioctl's return value is passed
 * on to the caller.
 */
int
lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount)
{
	nvlist_t *innvl = fnvlist_alloc();
	nvlist_t *outnvl = NULL;

	fnvlist_add_int32(innvl, DDT_PRUNE_UNIT, unit);
	fnvlist_add_uint64(innvl, DDT_PRUNE_AMOUNT, amount);

	const int rc = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, innvl, &outnvl);

	fnvlist_free(innvl);
	fnvlist_free(outnvl);

	return (rc);
}

View File

@ -72,6 +72,7 @@ dist_man_MANS = \
%D%/man8/zpool-create.8 \
%D%/man8/zpool-destroy.8 \
%D%/man8/zpool-detach.8 \
%D%/man8/zpool-ddtprune.8 \
%D%/man8/zpool-events.8 \
%D%/man8/zpool-export.8 \
%D%/man8/zpool-get.8 \

48
man/man8/zpool-ddtprune.8 Normal file
View File

@ -0,0 +1,48 @@
.\"
.\" CDDL HEADER START
.\"
.\" The contents of this file are subject to the terms of the
.\" Common Development and Distribution License (the "License").
.\" You may not use this file except in compliance with the License.
.\"
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
.\" or http://www.opensolaris.org/os/licensing.
.\" See the License for the specific language governing permissions
.\" and limitations under the License.
.\"
.\" When distributing Covered Code, include this CDDL HEADER in each
.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
.\" If applicable, add the following below this CDDL HEADER, with the
.\" fields enclosed by brackets "[]" replaced with your own identifying
.\" information: Portions Copyright [yyyy] [name of copyright owner]
.\"
.\" CDDL HEADER END
.\"
.\"
.\" Copyright (c) 2024, Klara Inc.
.\"
.Dd June 17, 2024
.Dt ZPOOL-DDTPRUNE 8
.Os
.
.Sh NAME
.Nm zpool-ddtprune
.Nd Prunes the oldest entries from the single reference dedup table(s)
.Sh SYNOPSIS
.Nm zpool
.Cm ddtprune
.Fl d Ar days | Fl p Ar percentage
.Ar pool
.Sh DESCRIPTION
This command prunes older unique entries from the dedup table.
As a complement to the dedup quota feature,
.Sy ddtprune
allows removal of older non-duplicate entries to make room for
newer duplicate entries.
.Pp
The amount to prune can be based on a target percentage of the unique entries
or based on the age (i.e., every unique entry older than N days).
.
.Sh SEE ALSO
.Xr zdb 8 ,
.Xr zpool-status 8

View File

@ -592,6 +592,7 @@ don't wait.
.Xr zpool-checkpoint 8 ,
.Xr zpool-clear 8 ,
.Xr zpool-create 8 ,
.Xr zpool-ddtprune 8 ,
.Xr zpool-destroy 8 ,
.Xr zpool-detach 8 ,
.Xr zpool-events 8 ,

View File

@ -125,6 +125,13 @@
* without which, no space would be recovered and the DDT would continue to be
* considered "over quota". See zap_shrink_enabled.
*
* ## Dedup table pruning
*
* As a complement to the dedup quota feature, ddtprune allows removal of older
* non-duplicate entries to make room for newer duplicate entries. The amount
* to prune can be based on a target percentage of the unique entries or based
* on the age (i.e., prune every unique entry older than N days).
*
* ## Dedup log
*
* Historically, all entries modified on a txg were written back to dedup
@ -228,6 +235,19 @@ int zfs_dedup_prefetch = 0;
*/
uint_t dedup_class_wait_txgs = 5;
/*
* How many DDT prune entries to add to the DDT sync AVL tree.
* Note these additional entries have a memory footprint of a
* ddt_entry_t (216 bytes).
*/
static uint32_t zfs_ddt_prunes_per_txg = 50000;
/*
* For testing, synthesize aged DDT entries
* (in global scope for ztest)
*/
boolean_t ddt_prune_artificial_age = B_FALSE;
boolean_t ddt_dump_prune_histogram = B_FALSE;
/*
* Don't do more than this many incremental flush passes per txg.
@ -268,10 +288,6 @@ static const uint64_t ddt_version_flags[] = {
[DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG,
};
/* Dummy version to signal that configure is still necessary */
#define DDT_VERSION_UNCONFIGURED (UINT64_MAX)
#ifdef _KERNEL
/* per-DDT kstats */
typedef struct {
/* total lookups and whether they returned new or existing entries */
@ -324,6 +340,7 @@ static const ddt_kstats_t ddt_kstats_template = {
{ "log_flush_time_rate", KSTAT_DATA_UINT32 },
};
#ifdef _KERNEL
#define _DDT_KSTAT_STAT(ddt, stat) \
&((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
#define DDT_KSTAT_BUMP(ddt, stat) \
@ -343,6 +360,7 @@ static const ddt_kstats_t ddt_kstats_template = {
#define DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
#endif /* _KERNEL */
static void
ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
dmu_tx_t *tx)
@ -715,6 +733,30 @@ ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX);
}
/*
 * Return the timestamp to record as an entry's class start time.
 *
 * Normally this is simply the current gethrestime_sec() value. When the
 * ddt_prune_artificial_age test knob is set, a random amount is subtracted
 * so that a freshly populated DDT looks aged and prune can be exercised
 * without waiting.
 */
static uint64_t
ddt_class_start(void)
{
	const uint64_t now = gethrestime_sec();

	if (!ddt_prune_artificial_age)
		return (now);

	/*
	 * Debug aid: pick how far back the entry may be dated. Rolls below
	 * 50 use 1/16 of the full range, rolls above 75 use half of it, and
	 * everything in between uses the full range.
	 */
	int roll = random_in_range(100);
	int span;
	if (roll < 50)
		span = (1 << 21) >> 4;
	else if (roll > 75)
		span = (1 << 21) / 2;
	else
		span = 1 << 21;

	return (now - random_in_range(span));
}
void
ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v)
{
@ -1022,6 +1064,47 @@ ddt_prefetch_all(spa_t *spa)
static int ddt_configure(ddt_t *ddt, boolean_t new);
/*
* If the BP passed to ddt_lookup has valid DVAs, then we need to compare them
* to the ones in the entry. If they're different, then the passed-in BP is
* from a previous generation of this entry (ie was previously pruned) and we
* have to act like the entry doesn't exist at all.
*
* This should only happen during a lookup to free the block (zio_ddt_free()).
*
* XXX this is similar in spirit to ddt_phys_select(), maybe can combine
* -- robn, 2024-02-09
*/
/*
 * Returns B_TRUE when the DVAs carried by `bp` (if any) match the DVAs
 * stored in `dde`, B_FALSE as soon as one of them differs.
 */
static boolean_t
ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde)
{
	const uint_t bp_ndvas = BP_GET_NDVAS(bp);

	/* A BP without DVAs gives us nothing to compare; accept the entry. */
	if (bp_ndvas == 0)
		return (B_TRUE);

	/*
	 * Pick the phys to compare against: a flat table has exactly one,
	 * a traditional table is indexed by the BP's DVA count.
	 */
	const dva_t *entry_dvas;
	if (ddt->ddt_flags & DDT_FLAG_FLAT)
		entry_dvas = dde->dde_phys->ddp_flat.ddp_dva;
	else
		entry_dvas = dde->dde_phys->ddp_trad[bp_ndvas].ddp_dva;

	/*
	 * Every DVA in the BP is expected to be present in the entry. A
	 * partial match is treated like any other mismatch; whatever caused
	 * it happened elsewhere and cannot be repaired here.
	 */
	uint_t i = 0;
	while (i < bp_ndvas) {
		if (!DVA_EQUAL(&entry_dvas[i], &bp->blk_dva[i]))
			return (B_FALSE);
		i++;
	}

	ASSERT3U(i, ==, bp_ndvas);
	return (B_TRUE);
}
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
{
@ -1057,8 +1140,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
/* If it's already loaded, we can just return it. */
DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
if (dde->dde_flags & DDE_FLAG_LOADED)
return (dde);
if (dde->dde_flags & DDE_FLAG_LOADED) {
if (ddt_entry_lookup_is_valid(ddt, bp, dde))
return (dde);
return (NULL);
}
/* Someone else is loading it, wait for it. */
dde->dde_waiters++;
@ -1077,7 +1163,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
}
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
return (dde);
/* Make sure the loaded entry matches the BP */
if (ddt_entry_lookup_is_valid(ddt, bp, dde))
return (dde);
return (NULL);
} else
DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);
@ -1086,32 +1176,42 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
/* Record the time this class was created (used by ddt prune) */
if (ddt->ddt_flags & DDT_FLAG_FLAT)
dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec();
dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start();
avl_insert(&ddt->ddt_tree, dde, where);
/* If its in the log tree, we can "load" it from there */
if (ddt->ddt_flags & DDT_FLAG_LOG) {
ddt_lightweight_entry_t ddlwe;
boolean_t found = B_FALSE;
if (ddt_log_take_key(ddt, ddt->ddt_log_active,
&search, &ddlwe)) {
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
found = B_TRUE;
} else if (ddt_log_take_key(ddt, ddt->ddt_log_flushing,
&search, &ddlwe)) {
DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit);
found = B_TRUE;
}
if (found) {
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
if (ddt_log_find_key(ddt, &search, &ddlwe)) {
/*
* See if we have the key first, and if so, set up
* the entry.
*/
dde->dde_type = ddlwe.ddlwe_type;
dde->dde_class = ddlwe.ddlwe_class;
memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
DDT_PHYS_SIZE(ddt));
/* Whatever we found isn't valid for this BP, eject */
if (!ddt_entry_lookup_is_valid(ddt, bp, dde)) {
avl_remove(&ddt->ddt_tree, dde);
ddt_free(ddt, dde);
return (NULL);
}
/* Remove it and count it */
if (ddt_log_remove_key(ddt,
ddt->ddt_log_active, &search)) {
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
} else {
VERIFY(ddt_log_remove_key(ddt,
ddt->ddt_log_flushing, &search));
DDT_KSTAT_BUMP(ddt,
dds_lookup_log_flushing_hit);
}
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
@ -1150,6 +1250,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
dde->dde_type = type; /* will be DDT_TYPES if no entry found */
dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
boolean_t valid = B_TRUE;
if (dde->dde_type == DDT_TYPES &&
dde->dde_class == DDT_CLASSES &&
ddt_over_quota(spa)) {
@ -1163,6 +1265,24 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
/* Flag cleanup required */
dde->dde_flags |= DDE_FLAG_OVERQUOTA;
} else if (error == 0) {
/*
* If what we loaded is no good for this BP and there's no one
* waiting for it, we can just remove it and get out. If its no
* good but there are waiters, we have to leave it, because we
* don't know what they want. If its not needed we'll end up
* taking an entry log/sync, but it can only happen if more
* than one previous version of this block is being deleted at
* the same time. This is extremely unlikely to happen and not
* worth the effort to deal with without taking an entry
* update.
*/
valid = ddt_entry_lookup_is_valid(ddt, bp, dde);
if (!valid && dde->dde_waiters == 0) {
avl_remove(&ddt->ddt_tree, dde);
ddt_free(ddt, dde);
return (NULL);
}
DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
@ -1191,7 +1311,10 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp)
dde->dde_flags |= DDE_FLAG_LOADED;
cv_broadcast(&dde->dde_cv);
return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde);
if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid)
return (NULL);
return (dde);
}
void
@ -1420,7 +1543,6 @@ not_found:
static void
ddt_table_alloc_kstats(ddt_t *ddt)
{
#ifdef _KERNEL
char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
char *name = kmem_asprintf("ddt_stats_%s",
zio_checksum_table[ddt->ddt_checksum].ci_name);
@ -1436,9 +1558,6 @@ ddt_table_alloc_kstats(ddt_t *ddt)
kmem_strfree(name);
kmem_strfree(mod);
#else
(void) ddt;
#endif /* _KERNEL */
}
static ddt_t *
@ -1468,13 +1587,11 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
static void
ddt_table_free(ddt_t *ddt)
{
#ifdef _KERNEL
if (ddt->ddt_ksp != NULL) {
kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
ddt->ddt_ksp->ks_data = NULL;
kstat_delete(ddt->ddt_ksp);
}
#endif /* _KERNEL */
ddt_log_free(ddt);
ASSERT0(avl_numnodes(&ddt->ddt_tree));
@ -1814,7 +1931,7 @@ ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v);
if (ddt_phys_birth(ddp, v) == 0) {
ASSERT3U(phys_refcnt, ==, 0);
ASSERT0(phys_refcnt);
continue;
}
if (DDT_PHYS_IS_DITTO(ddt, p)) {
@ -2288,8 +2405,9 @@ ddt_walk_ready(spa_t *spa)
return (B_TRUE);
}
int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
static int
ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe,
uint64_t flags, boolean_t wait)
{
do {
do {
@ -2298,7 +2416,11 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
if (ddt == NULL)
continue;
if (ddt->ddt_flush_force_txg > 0)
if (flags != 0 &&
(ddt->ddt_flags & flags) != flags)
continue;
if (wait && ddt->ddt_flush_force_txg > 0)
return (EAGAIN);
int error = ENOENT;
@ -2322,13 +2444,19 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
return (SET_ERROR(ENOENT));
}
/*
 * Public DDT walker: advances `ddb` and fills `ddlwe` via ddt_walk_impl()
 * with no table-flag filter, returning EAGAIN from the implementation while
 * a table has a forced flush pending.
 */
int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
{
	const uint64_t no_flag_filter = 0;
	const boolean_t honor_forced_flush = B_TRUE;
	int error;

	error = ddt_walk_impl(spa, ddb, ddlwe, no_flag_filter,
	    honor_forced_flush);
	return (error);
}
/*
* This function is used by Block Cloning (brt.c) to increase reference
* counter for the DDT entry if the block is already in DDT.
*
* Return false if the block, despite having the D bit set, is not present
* in the DDT. Currently this is not possible but might be in the future.
* See the comment below.
* in the DDT. This is possible when the DDT has been pruned by an admin
* or by the DDT quota mechanism.
*/
boolean_t
ddt_addref(spa_t *spa, const blkptr_t *bp)
@ -2359,28 +2487,13 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp));
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
/*
* This entry already existed (dde_type is real), so it must
* have refcnt >0 at the start of this txg. We are called from
* brt_pending_apply(), before frees are issued, so the refcnt
* can't be lowered yet. Therefore, it must be >0. We assert
* this because if the order of BRT and DDT interactions were
* ever to change and the refcnt was ever zero here, then
* likely further action is required to fill out the DDT entry,
* and this is a place that is likely to be missed in testing.
*/
ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0);
ddt_phys_addref(dde->dde_phys, v);
result = B_TRUE;
} else {
/*
* At the time of implementating this if the block has the
* DEDUP flag set it must exist in the DEDUP table, but
* there are many advocates that want ability to remove
* entries from DDT with refcnt=1. If this will happen,
* we may have a block with the DEDUP set, but which doesn't
* have a corresponding entry in the DDT. Be ready.
* If the block has the DEDUP flag set it still might not
* exist in the DEDUP table due to DDT pruning of entries
* where refcnt=1.
*/
ddt_remove(ddt, dde);
result = B_FALSE;
@ -2392,6 +2505,261 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
return (result);
}
/*
 * One prune candidate, queued by ddt_prune_entry() and consumed by
 * prune_candidates_sync(). Allocated with room for DDT_FLAT_PHYS_SIZE bytes
 * of trailing phys data.
 */
typedef struct ddt_prune_entry {
/* table the candidate was found in */
ddt_t *dpe_ddt;
/* copy of the candidate's key */
ddt_key_t dpe_key;
/* linkage on ddt_prune_info_t.dpi_candidates */
list_node_t dpe_node;
/* copy of the candidate's flat phys (flexible array member) */
ddt_univ_phys_t dpe_phys[];
} ddt_prune_entry_t;
/*
 * State shared between ddt_prune_walk() and the prune_candidates_sync()
 * sync task.
 */
typedef struct ddt_prune_info {
/* pool being pruned */
spa_t *dpi_spa;
/* number of times prune_candidates_sync() has run */
uint64_t dpi_txg_syncs;
/* number of entries whose phys was cleared */
uint64_t dpi_pruned;
/* list of ddt_prune_entry_t awaiting the next sync task */
list_t dpi_candidates;
} ddt_prune_info_t;
/*
 * Sync task that processes the prune candidates queued so far.
 *
 * arg is the ddt_prune_info_t owned by ddt_prune_walk(); tx is unused.
 * Every candidate on dpi_candidates is removed and freed. For each one that
 * is not already on the table's live tree and whose lookup returns a loaded,
 * non-logged entry, the flat phys is cleared and dpi_pruned is bumped.
 * dpi_txg_syncs is incremented once per invocation.
 */
static void
prune_candidates_sync(void *arg, dmu_tx_t *tx)
{
	(void) tx;
	ddt_prune_info_t *dpi = arg;
	ddt_prune_entry_t *dpe;

	/*
	 * Candidates are allocated by ddt_prune_entry() with a trailing flat
	 * phys. sizeof (*dpe) does not include the flexible array member, so
	 * the free size has to be spelled out to match the allocation.
	 */
	const size_t dpe_size =
	    sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE;

	spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER);

	/* Process the prune candidates collected so far */
	while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) {
		blkptr_t blk;
		ddt_t *ddt = dpe->dpe_ddt;

		ddt_enter(ddt);

		/*
		 * If it's on the live list, then it was loaded for update
		 * this txg and is no longer stale; skip it.
		 */
		if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) {
			ddt_exit(ddt);
			kmem_free(dpe, dpe_size);
			continue;
		}

		/* Rebuild a BP from the saved key/phys so we can look it up */
		ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key,
		    dpe->dpe_phys, DDT_PHYS_FLAT, &blk);

		ddt_entry_t *dde = ddt_lookup(ddt, &blk);
		if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) {
			ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
			/*
			 * Zero the physical, so we don't try to free DVAs
			 * at flush nor try to reuse this entry.
			 */
			ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT);
			dpi->dpi_pruned++;
		}

		ddt_exit(ddt);
		kmem_free(dpe, dpe_size);
	}

	spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG);
	dpi->dpi_txg_syncs++;
}
/*
* Prune candidates are collected in open context and processed
* in sync context as part of ddt_sync_table().
*/
/*
 * Allocate a prune candidate holding a copy of the given key and flat phys
 * and push it onto the head of `list`. Only valid for flat tables.
 */
static void
ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk,
    const ddt_univ_phys_t *ddp)
{
	ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);

	/* Header plus room for one flat phys in the trailing array. */
	ddt_prune_entry_t *cand = kmem_alloc(
	    sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE, KM_SLEEP);

	memcpy(cand->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE);
	cand->dpe_key = *ddk;
	cand->dpe_ddt = ddt;

	list_insert_head(list, cand);
}
/*
 * Iterate over all the entries in the DDT unique class of every flat table.
 * The walk will perform one or both of the following operations:
 * (a) build an age histogram that can be used when pruning
 *     (when `histogram` is not NULL; it is zeroed first)
 * (b) prune entries whose class start time is older than `cutoff`
 *     (when `cutoff` is non-zero)
 *
 * Prune candidates are queued and handed to prune_candidates_sync() through
 * dsl_sync_task() in batches of at most zfs_ddt_prunes_per_txg entries.
 * The walk stops early if the pool is shutting down or a signal is pending.
 *
 * Also called by zdb(8) to dump the age histogram.
 */
void
ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
{
	ddt_bookmark_t ddb = {
		.ddb_class = DDT_CLASS_UNIQUE,
		.ddb_type = 0,
		.ddb_checksum = 0,
		.ddb_cursor = 0
	};
	ddt_lightweight_entry_t ddlwe = {0};
	int error;
	int valid = 0;
	int candidates = 0;
	uint64_t now = gethrestime_sec();
	ddt_prune_info_t dpi;
	boolean_t pruning = (cutoff != 0);

	if (pruning) {
		dpi.dpi_txg_syncs = 0;
		dpi.dpi_pruned = 0;
		dpi.dpi_spa = spa;
		list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t),
		    offsetof(ddt_prune_entry_t, dpe_node));
	}

	if (histogram != NULL)
		memset(histogram, 0, sizeof (ddt_age_histo_t));

	while ((error =
	    ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) {
		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
		VERIFY(ddt);

		if (spa_shutting_down(spa) || issig())
			break;

		ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT);
		ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1);

		uint64_t class_start =
		    ddlwe.ddlwe_phys.ddp_flat.ddp_class_start;

		/*
		 * If this entry is on the log, then the stored entry is stale
		 * and we should skip it.
		 */
		if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
			continue;

		/* prune older entries */
		if (pruning && class_start < cutoff) {
			if (candidates++ >= zfs_ddt_prunes_per_txg) {
				/* sync prune candidates in batches */
				VERIFY0(dsl_sync_task(spa_name(spa),
				    NULL, prune_candidates_sync,
				    &dpi, 0, ZFS_SPACE_CHECK_NONE));
				candidates = 1;
			}
			ddt_prune_entry(&dpi.dpi_candidates, ddt,
			    &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys);
		}

		/* build a histogram */
		if (histogram != NULL) {
			/*
			 * Age in hours, at least 1. A start time ahead of
			 * the clock is treated as age zero rather than being
			 * allowed to wrap into the oldest bin.
			 */
			uint64_t elapsed =
			    (class_start < now) ? now - class_start : 0;
			uint64_t age = MAX(1, elapsed / 3600);
			int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);
			histogram->dah_entries++;
			histogram->dah_age_histo[bin]++;
		}

		valid++;
	}

	if (pruning) {
		if (!list_is_empty(&dpi.dpi_candidates)) {
			/* sync out final batch of prune candidates */
			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
			    prune_candidates_sync, &dpi, 0,
			    ZFS_SPACE_CHECK_NONE));
		}
		/* always pair list_create() above, even if nothing was seen */
		list_destroy(&dpi.dpi_candidates);

		if (valid > 0) {
			zfs_dbgmsg("pruned %llu entries (%d%%) across %llu "
			    "txg syncs",
			    (u_longlong_t)dpi.dpi_pruned,
			    (int)((dpi.dpi_pruned * 100) / valid),
			    (u_longlong_t)dpi.dpi_txg_syncs);
		}
	}
}
/*
 * Return the ddo_count reported by ddt_get_dedup_object_stats() for the pool.
 */
static uint64_t
ddt_total_entries(spa_t *spa)
{
	ddt_object_t totals;

	ddt_get_dedup_object_stats(spa, &totals);

	uint64_t count = totals.ddo_count;
	return (count);
}
/*
 * Prune unique (single reference) entries from the pool's flat dedup tables.
 *
 * unit selects how `amount` is interpreted:
 *   ZPOOL_DDT_PRUNE_PERCENTAGE  prune roughly `amount` percent of the unique
 *                               entries, oldest first (whole histogram bins)
 *   ZPOOL_DDT_PRUNE_AGE         prune entries older than `amount` seconds
 *
 * Returns EALREADY if a prune is already active on this pool, EINVAL for an
 * unknown unit, and 0 otherwise (including when there is nothing to prune or
 * the walk was cut short by shutdown or a signal).
 */
int
ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
    uint64_t amount)
{
	uint64_t cutoff = 0;
	uint64_t start_time = gethrtime();

	if (spa->spa_active_ddt_prune)
		return (SET_ERROR(EALREADY));
	if (ddt_total_entries(spa) == 0)
		return (0);

	/*
	 * Reject unknown units before raising the active flag, so an error
	 * return can never leave the pool marked as pruning.
	 */
	if (unit != ZPOOL_DDT_PRUNE_PERCENTAGE && unit != ZPOOL_DDT_PRUNE_AGE)
		return (SET_ERROR(EINVAL));

	spa->spa_active_ddt_prune = B_TRUE;

	zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount,
	    unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older");

	if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
		ddt_age_histo_t histogram;
		uint64_t oldest = 0;

		/* Make a pass over DDT to build a histogram */
		ddt_prune_walk(spa, 0, &histogram);

		/* Anything above 100% means "all of them" */
		uint64_t target =
		    (histogram.dah_entries * MIN(amount, 100)) / 100;
		boolean_t want_prune = (target > 0);

		/*
		 * Figure out our cutoff date
		 * (i.e., which bins to prune from)
		 */
		for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) {
			uint64_t in_bin = histogram.dah_age_histo[i];

			if (in_bin == 0)
				continue;

			if (target < in_bin) {
				/* less than this bucket remaining */
				oldest = ((uint64_t)1 << i) * 3600;
				target = 0;
			} else {
				target -= in_bin;
			}
		}

		/*
		 * A zero target leaves cutoff at 0 so nothing is pruned.
		 * Also never let the subtraction wrap: an "oldest" beyond
		 * the current time means no entry can be that old.
		 */
		uint64_t now = gethrestime_sec();
		if (want_prune && oldest < now)
			cutoff = now - oldest;

		if (ddt_dump_prune_histogram)
			ddt_dump_age_histogram(&histogram, cutoff);
	} else {
		/* ZPOOL_DDT_PRUNE_AGE: amount is an age in seconds */
		uint64_t now = gethrestime_sec();
		if (amount < now)
			cutoff = now - amount;
	}

	if (cutoff > 0 && !spa_shutting_down(spa) && !issig()) {
		/* Traverse DDT to prune entries older than our cutoff */
		ddt_prune_walk(spa, cutoff, NULL);
	}

	zfs_dbgmsg("%s: prune completed in %llu ms",
	    spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));

	spa->spa_active_ddt_prune = B_FALSE;
	return (0);
}
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
"Enable prefetching dedup-ed blks");

View File

@ -353,16 +353,15 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
}
boolean_t
ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe)
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
if (ddle == NULL)
return (B_FALSE);
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
ddt_lightweight_entry_t ddlwe;
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
@ -371,6 +370,21 @@ ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk,
return (B_TRUE);
}
/*
 * Look for `ddk` in the table's log trees without removing it.
 *
 * Returns B_TRUE if the key is present in either log, B_FALSE otherwise.
 * When found and `ddlwe` is not NULL, the log entry is copied out to it.
 */
boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
    ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *hit;

	/* The active log is consulted first, then the flushing log. */
	hit = avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
	if (hit == NULL) {
		hit = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
		if (hit == NULL)
			return (B_FALSE);
	}

	/* Callers that only test for presence pass a NULL ddlwe. */
	if (ddlwe != NULL)
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, hit, ddlwe);

	return (B_TRUE);
}
void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{

View File

@ -4342,6 +4342,51 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
return (total_errors > 0 ? SET_ERROR(EINVAL) : 0);
}
#define DDT_PRUNE_UNIT "ddt_prune_unit"
#define DDT_PRUNE_AMOUNT "ddt_prune_amount"
/*
 * Keys read from innvl by zfs_ioc_ddt_prune():
 *
 * innvl: {
 *     "ddt_prune_unit" -> int32_t (cast to zpool_ddt_prune_unit_t)
 *     "ddt_prune_amount" -> uint64_t
 * }
 *
 * outnvl: not populated by zfs_ioc_ddt_prune()
 */
static const zfs_ioc_key_t zfs_keys_ddt_prune[] = {
{DDT_PRUNE_UNIT, DATA_TYPE_INT32, 0},
{DDT_PRUNE_AMOUNT, DATA_TYPE_UINT64, 0},
};
/*
 * ioctl handler for pruning the dedup table of `poolname`.
 *
 * Reads DDT_PRUNE_UNIT (int32) and DDT_PRUNE_AMOUNT (uint64) from innvl and
 * passes them to ddt_prune_unique_entries(). outnvl is not used.
 *
 * Returns EINVAL if either key is missing, the spa_open() error if the pool
 * cannot be opened, ENOTSUP if the fast_dedup feature is not enabled, and
 * otherwise whatever ddt_prune_unique_entries() returns.
 */
static int
zfs_ioc_ddt_prune(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
	(void) outnvl;
	int32_t unit;
	uint64_t amount;

	if (nvlist_lookup_int32(innvl, DDT_PRUNE_UNIT, &unit) != 0 ||
	    nvlist_lookup_uint64(innvl, DDT_PRUNE_AMOUNT, &amount) != 0) {
		return (SET_ERROR(EINVAL));
	}

	spa_t *spa;
	int error = spa_open(poolname, &spa, FTAG);
	if (error != 0)
		return (error);

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP)) {
		spa_close(spa, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	error = ddt_prune_unique_entries(spa, (zpool_ddt_prune_unit_t)unit,
	    amount);

	/* drop the hold taken by spa_open() on every path past this point */
	spa_close(spa, FTAG);

	return (error);
}
/*
* This ioctl waits for activity of a particular type to complete. If there is
* no activity of that type in progress, it returns immediately, and the
@ -7430,6 +7475,11 @@ zfs_ioctl_init(void)
POOL_CHECK_NONE, B_FALSE, B_FALSE,
zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props));
zfs_ioctl_register("zpool_ddt_prune", ZFS_IOC_DDT_PRUNE,
zfs_ioc_ddt_prune, zfs_secpolicy_config, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune));
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,

View File

@ -3859,6 +3859,16 @@ zio_ddt_free(zio_t *zio)
}
ddt_exit(ddt);
/*
* When no entry was found, it must have been pruned,
* so we can free it now instead of decrementing the
* refcount in the DDT.
*/
if (!dde) {
BP_SET_DEDUP(bp, 0);
zio->io_pipeline |= ZIO_STAGE_DVA_FREE;
}
return (zio);
}