From 108562344c5c50e0a6d8bf4af0c33998fb0bbf6e Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Sat, 31 May 2025 19:12:16 -0400
Subject: [PATCH] Improve allocation fallback handling

Before this change, in case of any allocation error ZFS always fell
back to the normal class.  But with more different classes available
we might want more sophisticated logic.  For example, it makes sense
to fall back from dedup first to the special class (if it is allowed
to put DDT there) and only then to normal, since in a pool with dedup
and special classes populated the normal class likely has performance
characteristics unsuitable for dedup.

This change implements a general mechanism where the fallback order is
controlled by the same spa_preferred_class() as the initial class
selection.  As a first application it implements the mentioned
dedup->special->normal fallbacks.  I have more plans for it later.

Reviewed-by: Brian Behlendorf
Reviewed-by: Paul Dagnelie
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #17391
---
 include/sys/spa.h     |  2 ++
 module/zfs/ddt.c      | 49 +++++++++++++++++++---------------------
 module/zfs/spa_misc.c | 32 ++++++++++++++++++--------
 module/zfs/zio.c      | 52 +++++++++++++++++++++----------------------
 4 files changed, 74 insertions(+), 61 deletions(-)

diff --git a/include/sys/spa.h b/include/sys/spa.h
index 7dbb37406..a3e36c1f5 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1116,7 +1116,9 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
 extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
+extern boolean_t spa_has_dedup(spa_t *spa);
 extern boolean_t spa_has_slogs(spa_t *spa);
+extern boolean_t spa_has_special(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index d7326972c..60cbb7755 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -1037,29 +1037,18 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
 	ddt_free(ddt, dde);
 }
 
+/*
+ * We're considered over quota when we hit 85% full, or for larger drives,
+ * when there is less than 8GB free.
+ */
 static boolean_t
-ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc)
+ddt_special_over_quota(metaslab_class_t *mc)
 {
-	if (mc != NULL && metaslab_class_get_space(mc) > 0) {
-		/* Over quota if allocating outside of this special class */
-		if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
-		    dedup_class_wait_txgs) {
-			/* Waiting for some deferred frees to be processed */
-			return (B_TRUE);
-		}
-
-		/*
-		 * We're considered over quota when we hit 85% full, or for
-		 * larger drives, when there is less than 8GB free.
-		 */
-		uint64_t allocated = metaslab_class_get_alloc(mc);
-		uint64_t capacity = metaslab_class_get_space(mc);
-		uint64_t limit = MAX(capacity * 85 / 100,
-		    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
-
-		return (allocated >= limit);
-	}
-	return (B_FALSE);
+	uint64_t allocated = metaslab_class_get_alloc(mc);
+	uint64_t capacity = metaslab_class_get_space(mc);
+	uint64_t limit = MAX(capacity * 85 / 100,
+	    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);
+	return (allocated >= limit);
 }
 
 /*
@@ -1081,14 +1070,22 @@ ddt_over_quota(spa_t *spa)
 	if (spa->spa_dedup_table_quota != UINT64_MAX)
 		return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);
 
+	/*
+	 * Over quota if have to allocate outside of the dedup/special class.
+	 */
+	if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
+	    dedup_class_wait_txgs) {
+		/* Waiting for some deferred frees to be processed */
+		return (B_TRUE);
+	}
+
 	/*
 	 * For automatic quota, table size is limited by dedup or special class
 	 */
-	if (ddt_special_over_quota(spa, spa_dedup_class(spa)))
-		return (B_TRUE);
-	else if (spa_special_has_ddt(spa) &&
-	    ddt_special_over_quota(spa, spa_special_class(spa)))
-		return (B_TRUE);
+	if (spa_has_dedup(spa))
+		return (ddt_special_over_quota(spa_dedup_class(spa)));
+	else if (spa_special_has_ddt(spa))
+		return (ddt_special_over_quota(spa_special_class(spa)));
 
 	return (B_FALSE);
 }
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index a2cfee2de..f054e4290 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2009,8 +2009,7 @@ spa_dedup_class(spa_t *spa)
 boolean_t
 spa_special_has_ddt(spa_t *spa)
 {
-	return (zfs_ddt_data_is_special &&
-	    spa->spa_special_class->mc_groups != 0);
+	return (zfs_ddt_data_is_special && spa_has_special(spa));
 }
 
 /*
@@ -2019,6 +2018,9 @@ spa_special_has_ddt(spa_t *spa)
 metaslab_class_t *
 spa_preferred_class(spa_t *spa, const zio_t *zio)
 {
+	metaslab_class_t *mc = zio->io_metaslab_class;
+	boolean_t tried_dedup = (mc == spa_dedup_class(spa));
+	boolean_t tried_special = (mc == spa_special_class(spa));
 	const zio_prop_t *zp = &zio->io_prop;
 
 	/*
@@ -2036,12 +2038,10 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	 */
 	ASSERT(objtype != DMU_OT_INTENT_LOG);
 
-	boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
-
 	if (DMU_OT_IS_DDT(objtype)) {
-		if (spa->spa_dedup_class->mc_groups != 0)
+		if (spa_has_dedup(spa) && !tried_dedup && !tried_special)
 			return (spa_dedup_class(spa));
-		else if (has_special_class && zfs_ddt_data_is_special)
+		else if (spa_special_has_ddt(spa) && !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
@@ -2050,14 +2050,15 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	/* Indirect blocks for user data can land in special if allowed */
 	if (zp->zp_level > 0 &&
 	    (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
-		if (has_special_class && zfs_user_indirect_is_special)
+		if (zfs_user_indirect_is_special && spa_has_special(spa) &&
+		    !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
 	}
 
 	if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) {
-		if (has_special_class)
+		if (spa_has_special(spa) && !tried_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
@@ -2069,7 +2070,8 @@ spa_preferred_class(spa_t *spa, const zio_t *zio)
 	 * zfs_special_class_metadata_reserve_pct exclusively for metadata.
 	 */
 	if ((DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL) &&
-	    has_special_class && zio->io_size <= zp->zp_zpl_smallblk) {
+	    spa_has_special(spa) && !tried_special &&
+	    zio->io_size <= zp->zp_zpl_smallblk) {
 		metaslab_class_t *special = spa_special_class(spa);
 		uint64_t alloc = metaslab_class_get_alloc(special);
 		uint64_t space = metaslab_class_get_space(special);
@@ -2640,6 +2642,12 @@ spa_fini(void)
 	mutex_destroy(&spa_l2cache_lock);
 }
 
+boolean_t
+spa_has_dedup(spa_t *spa)
+{
+	return (spa->spa_dedup_class->mc_groups != 0);
+}
+
 /*
  * Return whether this pool has a dedicated slog device. No locking needed.
  * It's not a problem if the wrong answer is returned as it's only for
@@ -2651,6 +2659,12 @@ spa_has_slogs(spa_t *spa)
 	return (spa->spa_log_class->mc_groups != 0);
 }
 
+boolean_t
+spa_has_special(spa_t *spa)
+{
+	return (spa->spa_special_class->mc_groups != 0);
+}
+
 spa_log_state_t
 spa_get_log_state(spa_t *spa)
 {
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index ca84f919e..554de6b7d 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -4150,7 +4150,7 @@ static zio_t *
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
-	metaslab_class_t *mc;
+	metaslab_class_t *mc, *newmc;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
@@ -4193,7 +4193,7 @@ zio_dva_allocate(zio_t *zio)
 again:
 	/*
 	 * Try allocating the block in the usual metaslab class.
-	 * If that's full, allocate it in the normal class.
+	 * If that's full, allocate it in some other class(es).
 	 * If that's full, allocate as a gang block,
 	 * and if all are full, the allocation fails (which shouldn't happen).
 	 *
@@ -4208,29 +4208,29 @@ again:
 	    &zio->io_alloc_list, zio->io_allocator, zio);
 
 	/*
-	 * Fallback to normal class when an alloc class is full
+	 * When the dedup or special class is spilling into the normal class,
+	 * there can still be significant space available due to deferred
+	 * frees that are in-flight. We track the txg when this occurred and
+	 * back off adding new DDT entries for a few txgs to allow the free
+	 * blocks to be processed.
 	 */
-	if (error == ENOSPC && mc != spa_normal_class(spa)) {
-		/*
-		 * When the dedup or special class is spilling into the normal
-		 * class, there can still be significant space available due
-		 * to deferred frees that are in-flight. We track the txg when
-		 * this occurred and back off adding new DDT entries for a few
-		 * txgs to allow the free blocks to be processed.
-		 */
-		if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) &&
-		    mc == spa_special_class(spa))) &&
-		    spa->spa_dedup_class_full_txg != zio->io_txg) {
-			spa->spa_dedup_class_full_txg = zio->io_txg;
-			zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, "
-			    "%llu allocated of %llu",
-			    spa_name(spa), (int)zio->io_txg,
-			    mc == spa_dedup_class(spa) ? "dedup" : "special",
-			    (int)zio->io_size,
-			    (u_longlong_t)metaslab_class_get_alloc(mc),
-			    (u_longlong_t)metaslab_class_get_space(mc));
-		}
+	if (error == ENOSPC && spa->spa_dedup_class_full_txg != zio->io_txg &&
+	    (mc == spa_dedup_class(spa) || (mc == spa_special_class(spa) &&
+	    !spa_has_dedup(spa) && spa_special_has_ddt(spa)))) {
+		spa->spa_dedup_class_full_txg = zio->io_txg;
+		zfs_dbgmsg("%s[%llu]: %s class spilling, req size %llu, "
+		    "%llu allocated of %llu",
+		    spa_name(spa), (u_longlong_t)zio->io_txg,
+		    mc == spa_dedup_class(spa) ? "dedup" : "special",
+		    (u_longlong_t)zio->io_size,
+		    (u_longlong_t)metaslab_class_get_alloc(mc),
+		    (u_longlong_t)metaslab_class_get_space(mc));
+	}
 
+	/*
+	 * Fall back to some other class when this one is full.
+	 */
+	if (error == ENOSPC && (newmc = spa_preferred_class(spa, zio)) != mc) {
 		/*
 		 * If we are holding old class reservation, drop it.
 		 * Dispatch the next ZIO(s) there if some are waiting.
@@ -4246,15 +4246,15 @@ again:
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
-			    "trying normal class: zio %px, size %llu, error %d",
+			    "trying fallback: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 
-		zio->io_metaslab_class = mc = spa_normal_class(spa);
+		zio->io_metaslab_class = mc = newmc;
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
 
 		/*
-		 * If normal class uses throttling, return to that pipeline
+		 * If the new class uses throttling, return to that pipeline
 		 * stage. Otherwise just do another allocation attempt.
 		 */
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
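
As an aside, the dedup->special->normal ordering that spa_preferred_class()
now walks for DDT blocks can be modeled by the small standalone program
below. This is only an illustrative sketch: the identifiers (alloc_class_t,
next_ddt_class) are made up for the example and do not exist in ZFS, and the
real function also consults zfs_ddt_data_is_special and the class already
recorded in zio->io_metaslab_class rather than a plain enum.

/*
 * Illustrative model (not ZFS code) of the fallback chain for DDT
 * allocations introduced by this patch: dedup -> special -> normal.
 * All identifiers here are hypothetical.
 */
#include <stdio.h>
#include <stdbool.h>

typedef enum {
	CLASS_NONE, CLASS_DEDUP, CLASS_SPECIAL, CLASS_NORMAL
} alloc_class_t;

/*
 * Given the class already tried (CLASS_NONE on the first attempt),
 * return the next class to try for a DDT block, mirroring the order
 * spa_preferred_class() enforces in the patch.
 */
static alloc_class_t
next_ddt_class(alloc_class_t tried, bool has_dedup, bool special_has_ddt)
{
	if (has_dedup && tried == CLASS_NONE)
		return (CLASS_DEDUP);
	if (special_has_ddt && tried != CLASS_SPECIAL && tried != CLASS_NORMAL)
		return (CLASS_SPECIAL);
	return (CLASS_NORMAL);
}

int
main(void)
{
	const char *names[] = { "none", "dedup", "special", "normal" };
	alloc_class_t c = CLASS_NONE;

	/* Pool with both a dedup class and a DDT-capable special class. */
	while (c != CLASS_NORMAL) {
		alloc_class_t next = next_ddt_class(c, true, true);
		printf("tried %s -> try %s\n", names[c], names[next]);
		c = next;	/* pretend the allocation returned ENOSPC */
	}
	return (0);
}

With both classes present it prints the chain none -> dedup -> special ->
normal; with special_has_ddt set to false it degenerates to the old
dedup -> normal behavior.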