Improve allocation fallback handling

Before this change, on any allocation error ZFS always fell back to
the normal class.  But with more classes available we may want more
sophisticated logic.  For example, it makes sense to fall back from
dedup first to the special class (if it is allowed to hold DDT) and
only then to normal, since in a pool with both dedup and special
classes populated the normal class likely has performance
characteristics unsuitable for dedup.

This change implements a general mechanism where the fallback order
is controlled by the same spa_preferred_class() that makes the initial
class selection.  As a first application it implements the mentioned
dedup->special->normal fallback (a simplified sketch of the resulting
retry flow follows the commit metadata below).  I have more plans for
it later.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #17391
Alexander Motin 2025-05-31 19:12:16 -04:00 committed by GitHub
parent e0edfcbd4e
commit 108562344c
4 changed files with 74 additions and 61 deletions
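
For orientation before the diff, here is a minimal user-space sketch of the
retry flow this commit introduces: the same class-selection routine that
picks the initial class also picks the fallback, based on the class the ZIO
has already tried.  Everything in the sketch (alloc_class_t, pool_t,
next_class_for_ddt(), try_alloc()) is an illustrative stand-in rather than an
OpenZFS symbol; the real logic lives in spa_preferred_class() and
zio_dva_allocate() in the hunks below.

/*
 * Minimal, self-contained sketch of the dedup -> special -> normal fallback
 * (not OpenZFS code).  The dedup and special classes are pretended to be
 * full, so the allocation walks through all three classes.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum {
	CLASS_NONE,
	CLASS_DEDUP,
	CLASS_SPECIAL,
	CLASS_NORMAL
} alloc_class_t;

typedef struct {
	bool has_dedup;		/* pool has a populated dedup class */
	bool special_has_ddt;	/* special class present and may hold DDT */
} pool_t;

/*
 * Preferred class for a DDT block, given the class that already failed
 * (CLASS_NONE on the first attempt).  Roughly mirrors the tried_dedup /
 * tried_special checks this commit adds to spa_preferred_class().
 */
static alloc_class_t
next_class_for_ddt(const pool_t *p, alloc_class_t tried)
{
	if (tried == CLASS_NORMAL)	/* normal is the last resort */
		return (CLASS_NORMAL);
	if (p->has_dedup && tried != CLASS_DEDUP && tried != CLASS_SPECIAL)
		return (CLASS_DEDUP);
	if (p->special_has_ddt && tried != CLASS_SPECIAL)
		return (CLASS_SPECIAL);
	return (CLASS_NORMAL);
}

/* Toy allocator: pretend only the normal class has free space. */
static bool
try_alloc(alloc_class_t c)
{
	return (c == CLASS_NORMAL);
}

int
main(void)
{
	pool_t pool = { .has_dedup = true, .special_has_ddt = true };
	alloc_class_t mc = CLASS_NONE;

	/* Loosely mirrors the "again:" retry loop in zio_dva_allocate(). */
	for (;;) {
		alloc_class_t newmc = next_class_for_ddt(&pool, mc);

		if (try_alloc(newmc)) {
			printf("allocated from class %d\n", (int)newmc);
			break;
		}
		if (newmc == mc) {
			/* Nothing left to fall back to; ZFS would gang here. */
			printf("all classes full\n");
			break;
		}
		mc = newmc;	/* remember what was tried and ask again */
	}
	return (0);
}

With both upper classes "full" this prints that the block lands in the normal
class, i.e. the dedup->special->normal order described in the commit message.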

include/sys/spa.h

@@ -1116,7 +1116,9 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
 extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
+extern boolean_t spa_has_dedup(spa_t *spa);
 extern boolean_t spa_has_slogs(spa_t *spa);
+extern boolean_t spa_has_special(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);

module/zfs/ddt.c

@@ -1037,30 +1037,19 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 	ddt_free(ddt, dde);
 }
 
-static boolean_t
-ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc)
-{
-	if (mc != NULL && metaslab_class_get_space(mc) > 0) {
-		/* Over quota if allocating outside of this special class */
-		if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
-		    dedup_class_wait_txgs) {
-			/* Waiting for some deferred frees to be processed */
-			return (B_TRUE);
-		}
-
-		/*
-		 * We're considered over quota when we hit 85% full, or for
-		 * larger drives, when there is less than 8GB free.
-		 */
-		uint64_t allocated = metaslab_class_get_alloc(mc);
-		uint64_t capacity = metaslab_class_get_space(mc);
-		uint64_t limit = MAX(capacity * 85 / 100,
-		    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
-
-		return (allocated >= limit);
-	}
-
-	return (B_FALSE);
-}
+/*
+ * We're considered over quota when we hit 85% full, or for larger drives,
+ * when there is less than 8GB free.
+ */
+static boolean_t
+ddt_special_over_quota(metaslab_class_t *mc)
+{
+	uint64_t allocated = metaslab_class_get_alloc(mc);
+	uint64_t capacity = metaslab_class_get_space(mc);
+	uint64_t limit = MAX(capacity * 85 / 100,
+	    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
+
+	return (allocated >= limit);
+}
 
 /*
  * Check if the DDT is over its quota. This can be due to a few conditions:
@@ -1081,14 +1070,22 @@ ddt_over_quota(spa_t *spa)
 	if (spa->spa_dedup_table_quota != UINT64_MAX)
 		return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
 
+	/*
+	 * Over quota if have to allocate outside of the dedup/special class.
+	 */
+	if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
+	    dedup_class_wait_txgs) {
+		/* Waiting for some deferred frees to be processed */
+		return (B_TRUE);
+	}
+
 	/*
 	 * For automatic quota, table size is limited by dedup or special class
 	 */
-	if (ddt_special_over_quota(spa, spa_dedup_class(spa)))
-		return (B_TRUE);
-	else if (spa_special_has_ddt(spa) &&
-	    ddt_special_over_quota(spa, spa_special_class(spa)))
-		return (B_TRUE);
+	if (spa_has_dedup(spa))
+		return (ddt_special_over_quota(spa_dedup_class(spa)));
+	else if (spa_special_has_ddt(spa))
+		return (ddt_special_over_quota(spa_special_class(spa)));
 
 	return (B_FALSE);
 }

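A quick worked example of the automatic quota in ddt_special_over_quota()
above (illustrative figures, not part of the patch): MAX() keeps the higher
of the two limits, so the "less than 8 GiB free" rule only takes over once
15% of the class exceeds 8 GiB, i.e. for classes larger than roughly 53 GiB.
A 40 GiB special class is over quota at MAX(34 GiB, 32 GiB) = 34 GiB
allocated (the 85% rule), while a 1 TiB class is over quota at
MAX(~870 GiB, 1016 GiB) = 1016 GiB allocated (less than 8 GiB free).
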
module/zfs/spa_misc.c

@@ -2009,8 +2009,7 @@ spa_dedup_class(spa_t *spa)
 boolean_t
 spa_special_has_ddt(spa_t *spa)
 {
-	return (zfs_ddt_data_is_special &&
-	    spa->spa_special_class->mc_groups != 0);
+	return (zfs_ddt_data_is_special && spa_has_special(spa));
 }
 
 /*
@@ -2019,6 +2018,9 @@ spa_special_has_ddt(spa_t *spa)
 metaslab_class_t *
 spa_preferred_class(spa_t *spa, const zio_t *zio)
 {
+	metaslab_class_t *mc = zio->io_metaslab_class;
+	boolean_t tried_dedup = (mc == spa_dedup_class(spa));
+	boolean_t tried_special = (mc == spa_special_class(spa));
 	const zio_prop_t *zp = &zio->io_prop;
 
 	/*
@@ -2036,12 +2038,10 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	 */
 	ASSERT(objtype != DMU_OT_INTENT_LOG);
 
-	boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
-
 	if (DMU_OT_IS_DDT(objtype)) {
-		if (spa->spa_dedup_class->mc_groups != 0)
+		if (spa_has_dedup(spa) && !tried_dedup && !tried_special)
 			return (spa_dedup_class(spa));
-		else if (has_special_class && zfs_ddt_data_is_special)
+		else if (spa_special_has_ddt(spa) && !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
@@ -2050,14 +2050,15 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	/* Indirect blocks for user data can land in special if allowed */
 	if (zp->zp_level > 0 &&
 	    (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
-		if (has_special_class && zfs_user_indirect_is_special)
+		if (zfs_user_indirect_is_special && spa_has_special(spa) &&
+		    !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
 	}
 
 	if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) {
-		if (has_special_class)
+		if (spa_has_special(spa) && !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
@@ -2069,7 +2070,8 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	 * zfs_special_class_metadata_reserve_pct exclusively for metadata.
 	 */
 	if ((DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL) &&
-	    has_special_class && zio->io_size <= zp->zp_zpl_smallblk) {
+	    spa_has_special(spa) && !tried_special &&
+	    zio->io_size <= zp->zp_zpl_smallblk) {
 		metaslab_class_t *special = spa_special_class(spa);
 		uint64_t alloc = metaslab_class_get_alloc(special);
 		uint64_t space = metaslab_class_get_space(special);
@@ -2640,6 +2642,12 @@ spa_fini(void)
 	mutex_destroy(&spa_l2cache_lock);
 }
 
+boolean_t
+spa_has_dedup(spa_t *spa)
+{
+	return (spa->spa_dedup_class->mc_groups != 0);
+}
+
 /*
  * Return whether this pool has a dedicated slog device. No locking needed.
  * It's not a problem if the wrong answer is returned as it's only for
@@ -2651,6 +2659,12 @@ spa_has_slogs(spa_t *spa)
 	return (spa->spa_log_class->mc_groups != 0);
 }
 
+boolean_t
+spa_has_special(spa_t *spa)
+{
+	return (spa->spa_special_class->mc_groups != 0);
+}
+
 spa_log_state_t
 spa_get_log_state(spa_t *spa)
 {

module/zfs/zio.c

@@ -4150,7 +4150,7 @@ static zio_t *
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
-	metaslab_class_t *mc;
+	metaslab_class_t *mc, *newmc;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
@@ -4193,7 +4193,7 @@ zio_dva_allocate(zio_t *zio)
 again:
 	/*
 	 * Try allocating the block in the usual metaslab class.
-	 * If that's full, allocate it in the normal class.
+	 * If that's full, allocate it in some other class(es).
 	 * If that's full, allocate as a gang block,
 	 * and if all are full, the allocation fails (which shouldn't happen).
 	 *
@@ -4208,29 +4208,29 @@ again:
 	    &zio->io_alloc_list, zio->io_allocator, zio);
 
 	/*
-	 * Fallback to normal class when an alloc class is full
-	 */
-	if (error == ENOSPC && mc != spa_normal_class(spa)) {
-		/*
-		 * When the dedup or special class is spilling into the normal
-		 * class, there can still be significant space available due
-		 * to deferred frees that are in-flight. We track the txg when
-		 * this occurred and back off adding new DDT entries for a few
-		 * txgs to allow the free blocks to be processed.
-		 */
-		if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) &&
-		    mc == spa_special_class(spa))) &&
-		    spa->spa_dedup_class_full_txg != zio->io_txg) {
-			spa->spa_dedup_class_full_txg = zio->io_txg;
-			zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, "
-			    "%llu allocated of %llu",
-			    spa_name(spa), (int)zio->io_txg,
-			    mc == spa_dedup_class(spa) ? "dedup" : "special",
-			    (int)zio->io_size,
-			    (u_longlong_t)metaslab_class_get_alloc(mc),
-			    (u_longlong_t)metaslab_class_get_space(mc));
-		}
+	 * When the dedup or special class is spilling into the normal class,
+	 * there can still be significant space available due to deferred
+	 * frees that are in-flight. We track the txg when this occurred and
+	 * back off adding new DDT entries for a few txgs to allow the free
+	 * blocks to be processed.
+	 */
+	if (error == ENOSPC && spa->spa_dedup_class_full_txg != zio->io_txg &&
+	    (mc == spa_dedup_class(spa) || (mc == spa_special_class(spa) &&
+	    !spa_has_dedup(spa) && spa_special_has_ddt(spa)))) {
+		spa->spa_dedup_class_full_txg = zio->io_txg;
+		zfs_dbgmsg("%s[%llu]: %s class spilling, req size %llu, "
+		    "%llu allocated of %llu",
+		    spa_name(spa), (u_longlong_t)zio->io_txg,
+		    mc == spa_dedup_class(spa) ? "dedup" : "special",
+		    (u_longlong_t)zio->io_size,
+		    (u_longlong_t)metaslab_class_get_alloc(mc),
+		    (u_longlong_t)metaslab_class_get_space(mc));
+	}
 
+	/*
+	 * Fall back to some other class when this one is full.
+	 */
+	if (error == ENOSPC && (newmc = spa_preferred_class(spa, zio)) != mc) {
 		/*
 		 * If we are holding old class reservation, drop it.
 		 * Dispatch the next ZIO(s) there if some are waiting.
@@ -4246,15 +4246,15 @@ again:
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
-			    "trying normal class: zio %px, size %llu, error %d",
+			    "trying fallback: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 
-		zio->io_metaslab_class = mc = spa_normal_class(spa);
+		zio->io_metaslab_class = mc = newmc;
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
 
 		/*
-		 * If normal class uses throttling, return to that pipeline
+		 * If the new class uses throttling, return to that pipeline
 		 * stage. Otherwise just do another allocation attempt.
 		 */
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&