ddt: dedup table quota enforcement

This adds two new pool properties:
- dedup_table_size, the total size of all DDTs on the pool; and
- dedup_table_quota, the maximum possible size of all DDTs in the pool

When set, quota will be enforced by checking when a new entry is about
to be created. If the pool is over its dedup quota, the entry won't be
created, and the corresponding write will be converted to a regular
non-dedup write. Note that existing entries can be updated (i.e. their
refcounts changed), as that reuses the space rather than requiring more.

dedup_table_quota can be set to 'auto', which will set it based on the
size of the devices backing the "dedup" allocation class. This makes it
possible to limit the DDTs to the size of a dedup vdev only, such that
when the device fills, no new blocks are deduplicated.

Sponsored-by: iXsystems, Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Co-authored-by: Sean Eric Fagan <sean.fagan@klarasystems.com>
Closes #15889
This commit is contained in:
Allan Jude
2024-07-25 12:47:36 -04:00
committed by GitHub
parent 82f281ad99
commit c7ada64bb6
22 changed files with 599 additions and 22 deletions
+120 -6
View File
@@ -101,6 +101,22 @@
* object and (if necessary), removed from an old one. ddt_tree is cleared and
* the next txg can start.
*
* ## Dedup quota
*
* A maximum size for all DDTs on the pool can be set with the
* dedup_table_quota property. This is determined in ddt_over_quota() and
* enforced during ddt_lookup(). If the pool is at or over its quota limit,
* ddt_lookup() will only return entries for existing blocks, as updates are
* still possible. New entries will not be created; instead, ddt_lookup() will
* return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
* the D bit on the block and reissue the IO as a regular write. The block will
* not be deduplicated.
*
* Note that this is based on the on-disk size of the dedup store. Reclaiming
* this space after deleting entries relies on the ZAP "shrinking" behaviour,
* without which, no space would be recovered and the DDT would continue to be
* considered "over quota". See zap_shrink_enabled.
*
* ## Repair IO
*
* If a read on a dedup block fails, but there are other copies of the block in
@@ -152,6 +168,13 @@ static kmem_cache_t *ddt_entry_cache;
*/
int zfs_dedup_prefetch = 0;
/*
 * If the dedup class cannot satisfy a DDT allocation, treat as over quota
 * for this many TXGs.
 *
 * ddt_special_over_quota() compares the syncing txg against
 * spa_dedup_class_full_txg plus this value; while the syncing txg is at or
 * below that sum the class is reported as over quota, which gives deferred
 * frees time to be processed before new DDT entries are added again.
 */
uint_t dedup_class_wait_txgs = 5;
static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
&ddt_zap_ops,
};
@@ -554,8 +577,6 @@ ddt_alloc(const ddt_key_t *ddk)
static void
ddt_free(ddt_entry_t *dde)
{
ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
for (int p = 0; p < DDT_PHYS_TYPES; p++)
ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
@@ -575,9 +596,66 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
ddt_free(dde);
}
/*
 * Decide whether the given allocation class should be treated as full for
 * the purposes of the automatic dedup table quota.
 *
 * Returns B_FALSE if the class is NULL or reports no space. Otherwise
 * returns B_TRUE while the syncing txg is still within
 * dedup_class_wait_txgs of spa_dedup_class_full_txg, or once the allocated
 * space has reached the fill limit computed below.
 */
static boolean_t
ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc)
{
	/* A missing or empty class can never be over quota. */
	if (mc == NULL || metaslab_class_get_space(mc) == 0)
		return (B_FALSE);

	/*
	 * The class recently spilled (allocation happened outside of it).
	 * Stay over quota for a few txgs while deferred frees are processed.
	 */
	if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
	    dedup_class_wait_txgs)
		return (B_TRUE);

	const uint64_t used = metaslab_class_get_alloc(mc);
	const uint64_t total = metaslab_class_get_space(mc);
	const uint64_t reserve = 1ULL << 33;	/* 8GB */

	/*
	 * Two candidate thresholds: 85% of the class, or (for classes larger
	 * than 8GB) everything except the last 8GB. The larger threshold
	 * wins, so big classes are only required to keep 8GB free.
	 */
	const uint64_t pct_limit = total * 85 / 100;
	const uint64_t free_limit = (total > reserve) ? total - reserve : 0;

	return (used >= MAX(pct_limit, free_limit));
}
/*
 * Report whether the pool's DDTs are over quota, based on the value of the
 * 'dedup_table_quota' property (spa_dedup_table_quota):
 *
 *   0           no quota; never over.
 *   UINT64_MAX  automatic; over quota when the dedup class, or the special
 *               class if it can hold DDT data, is considered full by
 *               ddt_special_over_quota() (recent failed allocation, or the
 *               class has passed its fill limit).
 *   otherwise   explicit limit; over quota when the DDT dsize exceeds it.
 */
static boolean_t
ddt_over_quota(spa_t *spa)
{
	const uint64_t quota = spa->spa_dedup_table_quota;

	/* Quota disabled. */
	if (quota == 0)
		return (B_FALSE);

	/* Explicit limit: compare against the current DDT size. */
	if (quota != UINT64_MAX)
		return (ddt_get_ddt_dsize(spa) > quota);

	/*
	 * Automatic quota: bounded by the dedup class first, then by the
	 * special class when DDT data may live there.
	 */
	return (ddt_special_over_quota(spa, spa_dedup_class(spa)) ||
	    (spa_special_has_ddt(spa) &&
	    ddt_special_over_quota(spa, spa_special_class(spa))));
}
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{
spa_t *spa = ddt->ddt_spa;
ddt_key_t search;
ddt_entry_t *dde;
ddt_type_t type;
@@ -592,13 +670,28 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
/* Find an existing live entry */
dde = avl_find(&ddt->ddt_tree, &search, &where);
if (dde != NULL) {
/* Found it. If it's already loaded, we can just return it. */
/* If we went over quota, act like we didn't find it */
if (dde->dde_flags & DDE_FLAG_OVERQUOTA)
return (NULL);
/* If it's already loaded, we can just return it. */
if (dde->dde_flags & DDE_FLAG_LOADED)
return (dde);
/* Someone else is loading it, wait for it. */
dde->dde_waiters++;
while (!(dde->dde_flags & DDE_FLAG_LOADED))
cv_wait(&dde->dde_cv, &ddt->ddt_lock);
dde->dde_waiters--;
/* Loaded but over quota, forget we were ever here */
if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
if (dde->dde_waiters == 0) {
avl_remove(&ddt->ddt_tree, dde);
ddt_free(dde);
}
return (NULL);
}
return (dde);
}
@@ -639,14 +732,27 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
dde->dde_type = type; /* will be DDT_TYPES if no entry found */
dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
if (error == 0)
if (dde->dde_type == DDT_TYPES &&
dde->dde_class == DDT_CLASSES &&
ddt_over_quota(spa)) {
/* Over quota. If no one is waiting, clean up right now. */
if (dde->dde_waiters == 0) {
avl_remove(&ddt->ddt_tree, dde);
ddt_free(dde);
return (NULL);
}
/* Flag cleanup required */
dde->dde_flags |= DDE_FLAG_OVERQUOTA;
} else if (error == 0) {
ddt_stat_update(ddt, dde, -1ULL);
}
/* Entry loaded, everyone can proceed now */
dde->dde_flags |= DDE_FLAG_LOADED;
cv_broadcast(&dde->dde_cv);
return (dde);
return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde);
}
void
@@ -775,6 +881,7 @@ ddt_load(spa_t *spa)
memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
sizeof (ddt->ddt_histogram));
spa->spa_dedup_dspace = ~0ULL;
spa->spa_dedup_dsize = ~0ULL;
}
return (0);
@@ -1032,6 +1139,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
sizeof (ddt->ddt_histogram));
spa->spa_dedup_dspace = ~0ULL;
spa->spa_dedup_dsize = ~0ULL;
}
void
@@ -1123,7 +1231,13 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE);
ASSERT3P(dde, !=, NULL);
/* Can be NULL if the entry for this block was pruned. */
if (dde == NULL) {
ddt_exit(ddt);
spa_config_exit(spa, SCL_ZIO, FTAG);
return (B_FALSE);
}
if (dde->dde_type < DDT_TYPES) {
ddt_phys_t *ddp;
+44 -6
View File
@@ -129,7 +129,8 @@ ddt_histogram_empty(const ddt_histogram_t *ddh)
void
ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
{
/* Sum the statistics we cached in ddt_object_sync(). */
memset(ddo_total, 0, sizeof (*ddo_total));
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (!ddt)
@@ -138,8 +139,32 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
class++) {
dmu_object_info_t doi;
uint64_t cnt;
int err;
/*
* These stats were originally calculated
* during ddt_object_load().
*/
err = ddt_object_info(ddt, type, class, &doi);
if (err != 0)
continue;
err = ddt_object_count(ddt, type, class, &cnt);
if (err != 0)
continue;
ddt_object_t *ddo =
&ddt->ddt_object_stats[type][class];
ddo->ddo_count = cnt;
ddo->ddo_dspace =
doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count *
doi.doi_data_block_size;
ddo_total->ddo_count += ddo->ddo_count;
ddo_total->ddo_dspace += ddo->ddo_dspace;
ddo_total->ddo_mspace += ddo->ddo_mspace;
@@ -147,11 +172,24 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
}
}
/* ... and compute the averages. */
if (ddo_total->ddo_count != 0) {
ddo_total->ddo_dspace /= ddo_total->ddo_count;
ddo_total->ddo_mspace /= ddo_total->ddo_count;
}
/*
* This returns raw counts (not averages). One of the consumers,
* print_dedup_stats(), historically has expected raw counts.
*/
spa->spa_dedup_dsize = ddo_total->ddo_dspace;
}
/*
 * Return the pool-wide DDT dsize cached in spa_dedup_dsize. The value ~0ULL
 * marks the cache as stale (it is reset after each txg sync), in which case
 * ddt_get_dedup_object_stats() is called first to recompute it.
 */
uint64_t
ddt_get_ddt_dsize(spa_t *spa)
{
	if (spa->spa_dedup_dsize == ~0ULL) {
		/* Only the refresh of spa_dedup_dsize is wanted here. */
		ddt_object_t scratch;

		ddt_get_dedup_object_stats(spa, &scratch);
	}

	return (spa->spa_dedup_dsize);
}
void
+14
View File
@@ -406,6 +406,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
brt_get_ratio(spa), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL,
ddt_get_ddt_dsize(spa), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
rvd->vdev_state, src);
@@ -672,6 +675,10 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
error = SET_ERROR(EINVAL);
break;
case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
error = nvpair_value_uint64(elem, &intval);
break;
case ZPOOL_PROP_DELEGATION:
case ZPOOL_PROP_AUTOREPLACE:
case ZPOOL_PROP_LISTSNAPS:
@@ -4732,6 +4739,8 @@ spa_ld_get_props(spa_t *spa)
spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA,
&spa->spa_dedup_table_quota);
spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
spa->spa_autoreplace = (autoreplace != 0);
@@ -6588,6 +6597,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
spa->spa_dedup_table_quota =
zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA);
if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
@@ -9631,6 +9642,9 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
case ZPOOL_PROP_MULTIHOST:
spa->spa_multihost = intval;
break;
case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
spa->spa_dedup_table_quota = intval;
break;
default:
break;
}
+7
View File
@@ -1996,6 +1996,13 @@ spa_dedup_class(spa_t *spa)
return (spa->spa_dedup_class);
}
/*
 * Report whether the special allocation class is in use for DDT data: true
 * only when zfs_ddt_data_is_special is set and the special class has a
 * non-zero mc_groups count.
 */
boolean_t
spa_special_has_ddt(spa_t *spa)
{
	if (!zfs_ddt_data_is_special)
		return (B_FALSE);

	return (spa->spa_special_class->mc_groups != 0);
}
/*
* Locate an appropriate allocation class
*/
+29
View File
@@ -3503,6 +3503,15 @@ zio_ddt_write(zio_t *zio)
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE);
if (dde == NULL) {
/* DDT size is over its quota so no new entries */
zp->zp_dedup = B_FALSE;
BP_SET_DEDUP(bp, B_FALSE);
if (zio->io_bp_override == NULL)
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
return (zio);
}
ddp = &dde->dde_phys[p];
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
@@ -3727,6 +3736,26 @@ zio_dva_allocate(zio_t *zio)
* Fallback to normal class when an alloc class is full
*/
if (error == ENOSPC && mc != spa_normal_class(spa)) {
/*
* When the dedup or special class is spilling into the normal
* class, there can still be significant space available due
* to deferred frees that are in-flight. We track the txg when
* this occurred and back off adding new DDT entries for a few
* txgs to allow the free blocks to be processed.
*/
if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) &&
mc == spa_special_class(spa))) &&
spa->spa_dedup_class_full_txg != zio->io_txg) {
spa->spa_dedup_class_full_txg = zio->io_txg;
zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, "
"%llu allocated of %llu",
spa_name(spa), (int)zio->io_txg,
mc == spa_dedup_class(spa) ? "dedup" : "special",
(int)zio->io_size,
(u_longlong_t)metaslab_class_get_alloc(mc),
(u_longlong_t)metaslab_class_get_space(mc));
}
/*
* If throttling, transfer reservation over to normal class.
* The io_allocator slot can remain the same even though we