From 5e10fb6d7288927205a62ac2662f9863e7911bb9 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 Sep 2025 09:35:35 -0700 Subject: [PATCH] Add interface to interface spa_get_worst_case_min_alloc() function Provide an interface to retrieve the lowest and highest minimum allocation size for the normal allocation class. This can be used by external consumers of the DMU to estimate potential wasted capacity when setting the recordsize for an object. The new "min_alloc" and "max_alloc" keys are added to the pool configuration and used by default_volblocksize() to warn when an ineffecient block size is requested. For older kmods which don't yet include the new keys fallback to the previous logic. Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #17758 --- cmd/zfs/zfs_main.c | 28 +++++++++++++++++++++------- include/sys/fs/zfs.h | 2 ++ include/sys/spa.h | 1 + include/sys/spa_impl.h | 1 + module/zfs/spa_config.c | 2 ++ module/zfs/spa_misc.c | 15 +++++++++++++++ module/zfs/vdev.c | 15 ++++++++------- module/zfs/vdev_label.c | 2 ++ 8 files changed, 52 insertions(+), 14 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 9f81292f0..c28eb8aca 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -912,19 +912,15 @@ usage: } /* - * Return a default volblocksize for the pool which always uses more than - * half of the data sectors. This primarily applies to dRAID which always - * writes full stripe widths. + * Calculate the minimum allocation size based on the top-level vdevs. */ static uint64_t -default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +calculate_volblocksize(nvlist_t *config) { - uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + uint64_t asize = SPA_MINBLOCKSIZE; nvlist_t *tree, **vdevs; uint_t nvdevs; - nvlist_t *config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { @@ -955,6 +951,24 @@ default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) } } + return (asize); +} + +/* + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors. This primarily applies to dRAID which always + * writes full stripe widths. + */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ + uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + + nvlist_t *config = zpool_get_config(zhp, NULL); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, &asize) != 0) + asize = calculate_volblocksize(config); + /* * Calculate the target volblocksize such that more than half * of the asize is used. The following table is for 4k sectors. diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 4329e4e86..004e1795c 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -722,6 +722,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_MIN_ALLOC "min_alloc" +#define ZPOOL_CONFIG_MAX_ALLOC "max_alloc" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 6611141b9..76e4446d3 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1029,6 +1029,7 @@ extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); +extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index c7ecd3d0c..a5c2eb0c7 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -250,6 +250,7 @@ struct spa { uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_max_alloc; /* of vdevs in normal class */ uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index a77874ea0..41ab5162c 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -460,6 +460,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc); + fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 649fe2f63..d2983ba15 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -775,6 +775,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_max_alloc = 0; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ @@ -1796,6 +1797,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); } +/* + * Return the range of minimum allocation sizes for the normal allocation + * class. This can be used by external consumers of the DMU to estimate + * potential wasted capacity when setting the recordsize for an object. + * This is mainly for dRAID pools which always pad to a full stripe width. + */ +void +spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc) +{ + *min_alloc = spa->spa_min_alloc; + *max_alloc = spa->spa_max_alloc; +} + /* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be @@ -2980,6 +2994,7 @@ EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); +EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */ EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 82763c125..7d6d4606d 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1428,12 +1428,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; - if (spa->spa_gcd_alloc == INT_MAX) { + + if (min_alloc > spa->spa_max_alloc) + spa->spa_max_alloc = min_alloc; + + if (spa->spa_gcd_alloc == INT_MAX) spa->spa_gcd_alloc = min_alloc; - } else { - spa->spa_gcd_alloc = vdev_gcd(min_alloc, - spa->spa_gcd_alloc); - } + else + spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } void @@ -1487,8 +1489,7 @@ vdev_metaslab_group_create(vdev_t *vd) if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; - uint64_t min_alloc = vdev_get_min_alloc(vd); - vdev_spa_set_alloc(spa, min_alloc); + vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd)); } } } diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 5c0e750c4..a64cd1690 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -498,6 +498,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_MIN_ALLOC, + vdev_get_min_alloc(vd)); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,