From be5c6d96530e19efde7c0af771f9ddb0073ef751 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Wed, 16 Dec 2020 14:40:05 -0800 Subject: [PATCH] Only examine best metaslabs on each vdev On a system with very high fragmentation, we may need to do lots of gang allocations (e.g. most indirect block allocations (~50KB) may need to gang). Before failing a "normal" allocation and resorting to ganging, we try every metaslab. This has the impact of loading every metaslab (not a huge deal since we now typically keep all metaslabs loaded), and also iterating over every metaslab for every failing allocation. If there are many metaslabs (more than the typical ~200, e.g. due to vdev expansion or very large vdevs), the CPU cost of this iteration can be very impactful. This iteration is done with the mg_lock held, creating long hold times and high lock contention for concurrent allocations, ultimately causing long txg sync times and poor application performance. To address this, this commit changes the behavior of "normal" (not try_hard, not ZIL) allocations. These will now only examine the 100 best metaslabs (as determined by their ms_weight). If none of these have a large enough free segment, then the allocation will fail and we'll fall back on ganging. To accomplish this, we will now (normally) gang before doing a `try_hard` allocation. Non-try_hard allocations will only examine the 100 best metaslabs of each vdev. In summary, we will first try normal allocation. If that fails then we will do a gang allocation. If that fails then we will do a "try hard" gang allocation. If that fails then we will have a multi-layer gang block. Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens Closes #11327 --- include/sys/metaslab.h | 1 + man/man5/zfs-module-parameters.5 | 34 +++++++++++ module/zfs/metaslab.c | 100 ++++++++++++++++--------------- module/zfs/zio.c | 15 +++-- 4 files changed, 95 insertions(+), 55 deletions(-) diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index b3b7f8655..ecff65f13 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -78,6 +78,7 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 #define METASLAB_FASTWRITE 0x40 +#define METASLAB_ZIL 0x80 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 688974f17..41e8ffa79 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -526,6 +526,40 @@ memory that is the threshold. Default value: \fB25 percent\fR .RE +.sp +.ne 2 +.na +\fBzfs_metaslab_try_hard_before_gang\fR (int) +.ad +.RS 12n +If not set (the default), we will first try normal allocation. +If that fails then we will do a gang allocation. +If that fails then we will do a "try hard" gang allocation. +If that fails then we will have a multi-layer gang block. +.sp +If set, we will first try normal allocation. +If that fails then we will do a "try hard" allocation. +If that fails we will do a gang allocation. +If that fails we will do a "try hard" gang allocation. +If that fails then we will have a multi-layer gang block. +.sp +Default value: \fB0 (false)\fR +.RE + +.sp +.ne 2 +.na +\fBzfs_metaslab_find_max_tries\fR (int) +.ad +.RS 12n +When not trying hard, we only consider this number of the best metaslabs. 
+This improves performance, especially when there are many metaslabs per vdev +and the allocation can't actually be satisfied (so we would otherwise iterate +all the metaslabs). +.sp +Default value: \fB100\fR +.RE + .sp .ne 2 .na diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index ea1720a2c..bed6bf64c 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -264,9 +264,7 @@ int zfs_metaslab_switch_threshold = 2; * Internal switch to enable/disable the metaslab allocation tracing * facility. */ -#ifdef _METASLAB_TRACING -boolean_t metaslab_trace_enabled = B_TRUE; -#endif +boolean_t metaslab_trace_enabled = B_FALSE; /* * Maximum entries that the metaslab allocation tracing facility will keep @@ -276,9 +274,7 @@ boolean_t metaslab_trace_enabled = B_TRUE; * to every exceed this value. In debug mode, the system will panic if this * limit is ever reached allowing for further investigation. */ -#ifdef _METASLAB_TRACING uint64_t metaslab_trace_max_entries = 5000; -#endif /* * Maximum number of metaslabs per group that can be disabled @@ -314,6 +310,35 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE; */ uint32_t metaslab_by_size_min_shift = 14; +/* + * If not set, we will first try normal allocation. If that fails then + * we will do a gang allocation. If that fails then we will do a "try hard" + * gang allocation. If that fails then we will have a multi-layer gang + * block. + * + * If set, we will first try normal allocation. If that fails then + * we will do a "try hard" allocation. If that fails we will do a gang + * allocation. If that fails we will do a "try hard" gang allocation. If + * that fails then we will have a multi-layer gang block. + */ +int zfs_metaslab_try_hard_before_gang = B_FALSE; + +/* + * When not trying hard, we only consider the best zfs_metaslab_find_max_tries + * metaslabs. This improves performance, especially when there are many + * metaslabs per vdev and the allocation can't actually be satisfied (so we + * would otherwise iterate all the metaslabs). If there is a metaslab with a + * worse weight but it can actually satisfy the allocation, we won't find it + * until trying hard. This may happen if the worse metaslab is not loaded + * (and the true weight is better than we have calculated), or due to weight + * bucketization. E.g. we are looking for a 60K segment, and the best + * metaslabs all have free segments in the 32-63K bucket, but the best + * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a + * subsequent metaslab has ms_max_size >60KB (but fewer segments in this + * bucket, and therefore a lower weight). 
+ */ +int zfs_metaslab_find_max_tries = 100; + static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); @@ -325,19 +350,20 @@ static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); -#ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; typedef struct metaslab_stats { kstat_named_t metaslabstat_trace_over_limit; - kstat_named_t metaslabstat_df_find_under_floor; kstat_named_t metaslabstat_reload_tree; + kstat_named_t metaslabstat_too_many_tries; + kstat_named_t metaslabstat_try_hard; } metaslab_stats_t; static metaslab_stats_t metaslab_stats = { { "trace_over_limit", KSTAT_DATA_UINT64 }, - { "df_find_under_floor", KSTAT_DATA_UINT64 }, { "reload_tree", KSTAT_DATA_UINT64 }, + { "too_many_tries", KSTAT_DATA_UINT64 }, + { "try_hard", KSTAT_DATA_UINT64 }, }; #define METASLABSTAT_BUMP(stat) \ @@ -373,18 +399,6 @@ metaslab_stat_fini(void) kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } -#else - -void -metaslab_stat_init(void) -{ -} - -void -metaslab_stat_fini(void) -{ -} -#endif /* * ========================================================================== @@ -1355,9 +1369,7 @@ static void metaslab_size_tree_full_load(range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; -#ifdef _METASLAB_TRACING METASLABSTAT_BUMP(metaslabstat_reload_tree); -#endif ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; @@ -1667,13 +1679,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) } else { zfs_btree_index_t where; /* use segment of this size, or next largest */ -#ifdef _METASLAB_TRACING - metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; - if (size < (1 << mrap->mra_floor_shift)) { - METASLABSTAT_BUMP( - metaslabstat_df_find_under_floor); - } -#endif rs = metaslab_block_find(&msp->ms_allocatable_by_size, rt, msp->ms_start, size, &where); } @@ -4404,7 +4409,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva) * Metaslab allocation tracing facility * ========================================================================== */ -#ifdef _METASLAB_TRACING /* * Add an allocation trace element to the allocation tracing list. @@ -4479,21 +4483,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal) list_destroy(&zal->zal_list); zal->zal_size = 0; } -#else - -#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) - -void -metaslab_trace_init(zio_alloc_list_t *zal) -{ -} - -void -metaslab_trace_fini(zio_alloc_list_t *zal) -{ -} - -#endif /* _METASLAB_TRACING */ /* * ========================================================================== @@ -4634,8 +4623,16 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); + int tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; + + if (!try_hard && tries > zfs_metaslab_find_max_tries) { + METASLABSTAT_BUMP(metaslabstat_too_many_tries); + return (NULL); + } + tries++; + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -5287,9 +5284,12 @@ next: } while ((mg = mg->mg_next) != rotor); /* - * If we haven't tried hard, do so now. + * If we haven't tried hard, perhaps do so now. 
*/ - if (!try_hard) { + if (!try_hard && (zfs_metaslab_try_hard_before_gang || + GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || + psize <= 1 << spa->spa_min_ashift)) { + METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; } @@ -6245,3 +6245,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, + ZMOD_RW, "Try hard to allocate before ganging"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW, + "Normally only consider this many of the best metaslabs in each vdev"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ba438353a..3c2b731f7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3585,17 +3585,16 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ - error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, - cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % - spa->spa_alloc_count); + int flags = METASLAB_FASTWRITE | METASLAB_ZIL; + int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % + spa->spa_alloc_count; + error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, + 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); if (error == 0) { *slog = TRUE; } else { - error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, NULL, METASLAB_FASTWRITE, - &io_alloc_list, NULL, cityhash4(0, 0, 0, - os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); + error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, + 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); if (error == 0) *slog = FALSE; }
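
Note (illustrative only, not part of the patch): the fallback order described in the
commit message and in the new zfs_metaslab_try_hard_before_gang man page entry can be
sketched as follows. Every helper below (try_normal_alloc(), try_gang_alloc(), and
friends) is a hypothetical stand-in for the corresponding path through metaslab_alloc()
and zio_write_gang_block(); none of them is a real ZFS interface, and the sketch only
models success/failure, not the actual allocation state.

/*
 * Standalone sketch of the allocation fallback order, under the
 * assumptions stated above.  The "normal" pass examines only the best
 * zfs_metaslab_find_max_tries (default 100) metaslabs of each vdev;
 * the "try hard" passes examine all metaslabs.
 */
#include <stdio.h>

typedef int boolean_t;
#define	B_FALSE	0
#define	B_TRUE	1

/* Hypothetical stubs; each returns 0 on success, nonzero on failure. */
static int try_normal_alloc(void)          { return (1); } /* best metaslabs only  */
static int try_hard_alloc(void)            { return (1); } /* examine all metaslabs */
static int try_gang_alloc(void)            { return (1); } /* gang, normal pass     */
static int try_hard_gang_alloc(void)       { return (1); } /* gang, try_hard pass   */
static int try_multilayer_gang_alloc(void) { return (0); } /* last resort           */

static int
allocate_block(boolean_t try_hard_before_gang)
{
	if (try_normal_alloc() == 0)
		return (0);
	/* Only taken when zfs_metaslab_try_hard_before_gang is set. */
	if (try_hard_before_gang && try_hard_alloc() == 0)
		return (0);
	if (try_gang_alloc() == 0)
		return (0);
	if (try_hard_gang_alloc() == 0)
		return (0);
	return (try_multilayer_gang_alloc());
}

int
main(void)
{
	/* Default behavior: zfs_metaslab_try_hard_before_gang = 0. */
	printf("result: %d\n", allocate_block(B_FALSE));
	return (0);
}

With the tunable left at its default of 0, the try-hard pass is deferred until after
ganging has been attempted, which is the new ordering this patch introduces; setting it
to 1 yields an ordering closer to the previous exhaustive search, as described in the
man page text above.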