Mirror of https://git.proxmox.com/git/mirror_zfs.git (synced 2025-06-25 02:28:01 +03:00)
Improve allocation fallback handling
Before this change, on any allocation error ZFS always fell back to the normal class. But with more different classes available we might want more sophisticated logic. For example, it makes sense to fall back from dedup first to the special class (if DDT is allowed to be put there) and only then to normal, since in a pool with dedup and special classes populated the normal class likely has performance characteristics unsuitable for dedup.

This change implements a general mechanism where the fallback order is controlled by the same spa_preferred_class() that makes the initial class selection. As a first application, it implements the mentioned dedup->special->normal fallback. I have more plans for it later.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #17391
commit 108562344c
parent e0edfcbd4e
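As a rough illustration of the fallback order described in the commit message (dedup -> special -> normal), here is a minimal, self-contained C sketch. It is not OpenZFS code: the enum, the next_class() helper and its parameters are invented for this example; the real decision is made by spa_preferred_class() in the diff below.

/*
 * Illustration only: pick the next allocation class to try after the
 * previous one failed with ENOSPC.  All names here are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum { CLASS_DEDUP, CLASS_SPECIAL, CLASS_NORMAL } alloc_class_t;

static alloc_class_t
next_class(alloc_class_t tried, bool has_special, bool ddt_on_special)
{
	switch (tried) {
	case CLASS_DEDUP:
		/* Prefer special over normal if DDT data may live there. */
		if (has_special && ddt_on_special)
			return (CLASS_SPECIAL);
		return (CLASS_NORMAL);
	case CLASS_SPECIAL:
	default:
		return (CLASS_NORMAL);	/* last resort */
	}
}

int
main(void)
{
	alloc_class_t c = CLASS_DEDUP;

	c = next_class(c, true, true);	/* dedup full -> special */
	printf("after dedup: %d\n", c);
	c = next_class(c, true, true);	/* special full -> normal */
	printf("after special: %d\n", c);
	return (0);
}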
@@ -1116,7 +1116,9 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
 extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
+extern boolean_t spa_has_dedup(spa_t *spa);
 extern boolean_t spa_has_slogs(spa_t *spa);
+extern boolean_t spa_has_special(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
@@ -1037,29 +1037,18 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 	ddt_free(ddt, dde);
 }
 
+/*
+ * We're considered over quota when we hit 85% full, or for larger drives,
+ * when there is less than 8GB free.
+ */
 static boolean_t
-ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc)
+ddt_special_over_quota(metaslab_class_t *mc)
 {
-	if (mc != NULL && metaslab_class_get_space(mc) > 0) {
-		/* Over quota if allocating outside of this special class */
-		if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
-		    dedup_class_wait_txgs) {
-			/* Waiting for some deferred frees to be processed */
-			return (B_TRUE);
-		}
-
-		/*
-		 * We're considered over quota when we hit 85% full, or for
-		 * larger drives, when there is less than 8GB free.
-		 */
-		uint64_t allocated = metaslab_class_get_alloc(mc);
-		uint64_t capacity = metaslab_class_get_space(mc);
-		uint64_t limit = MAX(capacity * 85 / 100,
-		    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
+	uint64_t allocated = metaslab_class_get_alloc(mc);
+	uint64_t capacity = metaslab_class_get_space(mc);
+	uint64_t limit = MAX(capacity * 85 / 100,
+	    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
 
-		return (allocated >= limit);
-	}
-	return (B_FALSE);
+	return (allocated >= limit);
 }
 
 /*
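To make the 85% / 8 GB rule in the new ddt_special_over_quota() concrete, here is a small standalone computation using the same formula; the capacity values are made-up examples, and the MAX macro is defined locally rather than taken from the ZFS headers.

/*
 * Worked example of the over-quota limit above:
 *   limit = MAX(85% of capacity, capacity - 8 GiB)
 * Example capacities are arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static uint64_t
quota_limit(uint64_t capacity)
{
	return (MAX(capacity * 85 / 100,
	    (capacity > (1ULL << 33)) ? capacity - (1ULL << 33) : 0));
}

int
main(void)
{
	/* 20 GiB class: 85% (17 GiB) wins over capacity - 8 GiB (12 GiB). */
	printf("%llu\n", (unsigned long long)quota_limit(20ULL << 30));
	/* 1 TiB class: capacity - 8 GiB (1016 GiB) wins over 85% (~870 GiB). */
	printf("%llu\n", (unsigned long long)quota_limit(1ULL << 40));
	return (0);
}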
@@ -1081,14 +1070,22 @@ ddt_over_quota(spa_t *spa)
 	if (spa->spa_dedup_table_quota != UINT64_MAX)
 		return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
 
+	/*
+	 * Over quota if have to allocate outside of the dedup/special class.
+	 */
+	if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
+	    dedup_class_wait_txgs) {
+		/* Waiting for some deferred frees to be processed */
+		return (B_TRUE);
+	}
+
 	/*
 	 * For automatic quota, table size is limited by dedup or special class
 	 */
-	if (ddt_special_over_quota(spa, spa_dedup_class(spa)))
-		return (B_TRUE);
-	else if (spa_special_has_ddt(spa) &&
-	    ddt_special_over_quota(spa, spa_special_class(spa)))
-		return (B_TRUE);
+	if (spa_has_dedup(spa))
+		return (ddt_special_over_quota(spa_dedup_class(spa)));
+	else if (spa_special_has_ddt(spa))
+		return (ddt_special_over_quota(spa_special_class(spa)));
 
 	return (B_FALSE);
 }
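The txg check that ddt_over_quota() gains above implements a back-off window: after the dedup/special class fills in some txg, the table keeps reporting over-quota for dedup_class_wait_txgs more txgs so that in-flight deferred frees can land. A tiny sketch of just that comparison follows; the plain arguments stand in for spa_syncing_txg(spa), spa->spa_dedup_class_full_txg and the dedup_class_wait_txgs tunable, and the function name is invented.

/*
 * Sketch of the back-off window used above.  Arguments stand in for the
 * spa_t fields and the module parameter; not actual OpenZFS code.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
ddt_still_backing_off(uint64_t syncing_txg, uint64_t class_full_txg,
    uint64_t wait_txgs)
{
	/* Over quota until wait_txgs txgs have passed since the class filled. */
	return (syncing_txg <= class_full_txg + wait_txgs);
}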
@@ -2009,8 +2009,7 @@ spa_dedup_class(spa_t *spa)
 boolean_t
 spa_special_has_ddt(spa_t *spa)
 {
-	return (zfs_ddt_data_is_special &&
-	    spa->spa_special_class->mc_groups != 0);
+	return (zfs_ddt_data_is_special && spa_has_special(spa));
 }
 
 /*
@@ -2019,6 +2018,9 @@ spa_special_has_ddt(spa_t *spa)
 metaslab_class_t *
 spa_preferred_class(spa_t *spa, const zio_t *zio)
 {
+	metaslab_class_t *mc = zio->io_metaslab_class;
+	boolean_t tried_dedup = (mc == spa_dedup_class(spa));
+	boolean_t tried_special = (mc == spa_special_class(spa));
 	const zio_prop_t *zp = &zio->io_prop;
 
 	/*
@@ -2036,12 +2038,10 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	 */
 	ASSERT(objtype != DMU_OT_INTENT_LOG);
 
-	boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
-
 	if (DMU_OT_IS_DDT(objtype)) {
-		if (spa->spa_dedup_class->mc_groups != 0)
+		if (spa_has_dedup(spa) && !tried_dedup && !tried_special)
 			return (spa_dedup_class(spa));
-		else if (has_special_class && zfs_ddt_data_is_special)
+		else if (spa_special_has_ddt(spa) && !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
@@ -2050,14 +2050,15 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	/* Indirect blocks for user data can land in special if allowed */
 	if (zp->zp_level > 0 &&
 	    (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
-		if (has_special_class && zfs_user_indirect_is_special)
+		if (zfs_user_indirect_is_special && spa_has_special(spa) &&
+		    !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
 	}
 
 	if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) {
-		if (has_special_class)
+		if (spa_has_special(spa) && !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
@@ -2069,7 +2070,8 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	 * zfs_special_class_metadata_reserve_pct exclusively for metadata.
 	 */
 	if ((DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL) &&
-	    has_special_class && zio->io_size <= zp->zp_zpl_smallblk) {
+	    spa_has_special(spa) && !tried_special &&
+	    zio->io_size <= zp->zp_zpl_smallblk) {
 		metaslab_class_t *special = spa_special_class(spa);
 		uint64_t alloc = metaslab_class_get_alloc(special);
 		uint64_t space = metaslab_class_get_space(special);
@@ -2640,6 +2642,12 @@ spa_fini(void)
 	mutex_destroy(&spa_l2cache_lock);
 }
 
+boolean_t
+spa_has_dedup(spa_t *spa)
+{
+	return (spa->spa_dedup_class->mc_groups != 0);
+}
+
 /*
  * Return whether this pool has a dedicated slog device. No locking needed.
  * It's not a problem if the wrong answer is returned as it's only for
@@ -2651,6 +2659,12 @@ spa_has_slogs(spa_t *spa)
 	return (spa->spa_log_class->mc_groups != 0);
 }
 
+boolean_t
+spa_has_special(spa_t *spa)
+{
+	return (spa->spa_special_class->mc_groups != 0);
+}
+
 spa_log_state_t
 spa_get_log_state(spa_t *spa)
 {
@@ -4150,7 +4150,7 @@ static zio_t *
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
-	metaslab_class_t *mc;
+	metaslab_class_t *mc, *newmc;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
@@ -4193,7 +4193,7 @@ zio_dva_allocate(zio_t *zio)
 again:
 	/*
 	 * Try allocating the block in the usual metaslab class.
-	 * If that's full, allocate it in the normal class.
+	 * If that's full, allocate it in some other class(es).
 	 * If that's full, allocate as a gang block,
 	 * and if all are full, the allocation fails (which shouldn't happen).
 	 *
@@ -4208,29 +4208,29 @@ again:
 	    &zio->io_alloc_list, zio->io_allocator, zio);
 
 	/*
-	 * Fallback to normal class when an alloc class is full
+	 * When the dedup or special class is spilling into the normal class,
+	 * there can still be significant space available due to deferred
+	 * frees that are in-flight. We track the txg when this occurred and
+	 * back off adding new DDT entries for a few txgs to allow the free
+	 * blocks to be processed.
 	 */
-	if (error == ENOSPC && mc != spa_normal_class(spa)) {
-		/*
-		 * When the dedup or special class is spilling into the normal
-		 * class, there can still be significant space available due
-		 * to deferred frees that are in-flight. We track the txg when
-		 * this occurred and back off adding new DDT entries for a few
-		 * txgs to allow the free blocks to be processed.
-		 */
-		if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) &&
-		    mc == spa_special_class(spa))) &&
-		    spa->spa_dedup_class_full_txg != zio->io_txg) {
+	if (error == ENOSPC && spa->spa_dedup_class_full_txg != zio->io_txg &&
+	    (mc == spa_dedup_class(spa) || (mc == spa_special_class(spa) &&
+	    !spa_has_dedup(spa) && spa_special_has_ddt(spa)))) {
 		spa->spa_dedup_class_full_txg = zio->io_txg;
-		zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, "
+		zfs_dbgmsg("%s[%llu]: %s class spilling, req size %llu, "
 		    "%llu allocated of %llu",
-		    spa_name(spa), (int)zio->io_txg,
+		    spa_name(spa), (u_longlong_t)zio->io_txg,
 		    mc == spa_dedup_class(spa) ? "dedup" : "special",
-		    (int)zio->io_size,
+		    (u_longlong_t)zio->io_size,
 		    (u_longlong_t)metaslab_class_get_alloc(mc),
 		    (u_longlong_t)metaslab_class_get_space(mc));
 	}
 
+	/*
+	 * Fall back to some other class when this one is full.
+	 */
+	if (error == ENOSPC && (newmc = spa_preferred_class(spa, zio)) != mc) {
 		/*
 		 * If we are holding old class reservation, drop it.
 		 * Dispatch the next ZIO(s) there if some are waiting.
@@ -4246,15 +4246,15 @@ again:
 
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
-			    "trying normal class: zio %px, size %llu, error %d",
+			    "trying fallback: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
-		zio->io_metaslab_class = mc = spa_normal_class(spa);
+		zio->io_metaslab_class = mc = newmc;
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
 
 		/*
-		 * If normal class uses throttling, return to that pipeline
+		 * If the new class uses throttling, return to that pipeline
 		 * stage. Otherwise just do another allocation attempt.
 		 */
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
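Taken together, the zio_dva_allocate() changes turn the single "fall back to normal" step into a loop: on ENOSPC the zio asks spa_preferred_class() for the next class in the fallback order and jumps back to the again: label. A condensed pseudo-C sketch of that control flow follows; it is not compilable on its own, try_alloc() is a placeholder for the real metaslab_alloc() call, and throttling and gang-block handling are omitted.

/*
 * Condensed sketch of the retry flow in zio_dva_allocate() after this
 * change.  try_alloc() is a stand-in; the real function also re-enters
 * the allocation-throttle pipeline stage and can gang the block.
 */
static int
dva_allocate_sketch(spa_t *spa, zio_t *zio)
{
	metaslab_class_t *mc = zio->io_metaslab_class;
	metaslab_class_t *newmc;
	int error;

again:
	error = try_alloc(spa, mc, zio);
	if (error == ENOSPC &&
	    (newmc = spa_preferred_class(spa, zio)) != mc) {
		/* Move to the next class in the fallback order and retry. */
		zio->io_metaslab_class = mc = newmc;
		goto again;
	}
	return (error);
}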