diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index b3b7f8655..ecff65f13 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -78,6 +78,7 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
 #define	METASLAB_DONT_THROTTLE		0x10
 #define	METASLAB_MUST_RESERVE		0x20
 #define	METASLAB_FASTWRITE		0x40
+#define	METASLAB_ZIL			0x80
 
 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
     blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 688974f17..41e8ffa79 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -526,6 +526,40 @@ memory that is the threshold.
 Default value: \fB25 percent\fR
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_metaslab_try_hard_before_gang\fR (int)
+.ad
+.RS 12n
+If not set (the default), we will first try normal allocation.
+If that fails then we will do a gang allocation.
+If that fails then we will do a "try hard" gang allocation.
+If that fails then we will have a multi-layer gang block.
+.sp
+If set, we will first try normal allocation.
+If that fails then we will do a "try hard" allocation.
+If that fails then we will do a gang allocation.
+If that fails then we will do a "try hard" gang allocation.
+If that fails then we will have a multi-layer gang block.
+.sp
+Default value: \fB0 (false)\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_metaslab_find_max_tries\fR (int)
+.ad
+.RS 12n
+When not trying hard, we only consider this number of the best metaslabs.
+This improves performance, especially when there are many metaslabs per vdev
+and the allocation can't actually be satisfied (so we would otherwise iterate
+all the metaslabs).
+.sp
+Default value: \fB100\fR
+.RE
+
 .sp
 .ne 2
 .na
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index ea1720a2c..bed6bf64c 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -264,9 +264,7 @@ int zfs_metaslab_switch_threshold = 2;
  * Internal switch to enable/disable the metaslab allocation tracing
  * facility.
  */
-#ifdef _METASLAB_TRACING
-boolean_t metaslab_trace_enabled = B_TRUE;
-#endif
+boolean_t metaslab_trace_enabled = B_FALSE;
 
 /*
  * Maximum entries that the metaslab allocation tracing facility will keep
@@ -276,9 +274,7 @@ boolean_t metaslab_trace_enabled = B_TRUE;
  * to ever exceed this value. In debug mode, the system will panic if this
  * limit is ever reached allowing for further investigation.
  */
-#ifdef _METASLAB_TRACING
 uint64_t metaslab_trace_max_entries = 5000;
-#endif
 
 /*
  * Maximum number of metaslabs per group that can be disabled
@@ -314,6 +310,35 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE;
  */
 uint32_t metaslab_by_size_min_shift = 14;
 
+/*
+ * If not set, we will first try normal allocation.  If that fails then
+ * we will do a gang allocation.  If that fails then we will do a "try hard"
+ * gang allocation.  If that fails then we will have a multi-layer gang
+ * block.
+ *
+ * If set, we will first try normal allocation.  If that fails then
+ * we will do a "try hard" allocation.  If that fails then we will do a
+ * gang allocation.  If that fails then we will do a "try hard" gang
+ * allocation.  If that fails then we will have a multi-layer gang block.
+ */
+int zfs_metaslab_try_hard_before_gang = B_FALSE;
+
+/*
+ * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
+ * metaslabs.  This improves performance, especially when there are many
+ * metaslabs per vdev and the allocation can't actually be satisfied (so we
+ * would otherwise iterate all the metaslabs).  If there is a metaslab with a
+ * worse weight but it can actually satisfy the allocation, we won't find it
+ * until trying hard.  This may happen if the worse metaslab is not loaded
+ * (and the true weight is better than we have calculated), or due to weight
+ * bucketization.  E.g. we are looking for a 60K segment, and the best
+ * metaslabs all have free segments in the 32-63K bucket, but the best
+ * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
+ * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
+ * bucket, and therefore a lower weight).
+ */
+int zfs_metaslab_find_max_tries = 100;
+
 static uint64_t metaslab_weight(metaslab_t *, boolean_t);
 static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -325,19 +350,20 @@ static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 static unsigned int metaslab_idx_func(multilist_t *, void *);
 static void metaslab_evict(metaslab_t *, uint64_t);
 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
-#ifdef _METASLAB_TRACING
 kmem_cache_t *metaslab_alloc_trace_cache;
 
 typedef struct metaslab_stats {
 	kstat_named_t metaslabstat_trace_over_limit;
-	kstat_named_t metaslabstat_df_find_under_floor;
 	kstat_named_t metaslabstat_reload_tree;
+	kstat_named_t metaslabstat_too_many_tries;
+	kstat_named_t metaslabstat_try_hard;
 } metaslab_stats_t;
 
 static metaslab_stats_t metaslab_stats = {
 	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
-	{ "df_find_under_floor",	KSTAT_DATA_UINT64 },
 	{ "reload_tree",		KSTAT_DATA_UINT64 },
+	{ "too_many_tries",		KSTAT_DATA_UINT64 },
+	{ "try_hard",			KSTAT_DATA_UINT64 },
 };
 
 #define	METASLABSTAT_BUMP(stat) \
@@ -373,18 +399,6 @@ metaslab_stat_fini(void)
 	kmem_cache_destroy(metaslab_alloc_trace_cache);
 	metaslab_alloc_trace_cache = NULL;
 }
-#else
-
-void
-metaslab_stat_init(void)
-{
-}
-
-void
-metaslab_stat_fini(void)
-{
-}
-#endif
 
 /*
  * ==========================================================================
@@ -1355,9 +1369,7 @@ static void
 metaslab_size_tree_full_load(range_tree_t *rt)
 {
 	metaslab_rt_arg_t *mrap = rt->rt_arg;
-#ifdef _METASLAB_TRACING
 	METASLABSTAT_BUMP(metaslabstat_reload_tree);
-#endif
 	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
 	mrap->mra_floor_shift = 0;
 	struct mssa_arg arg = {0};
@@ -1667,13 +1679,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 	} else {
 		zfs_btree_index_t where;
 		/* use segment of this size, or next largest */
-#ifdef _METASLAB_TRACING
-		metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
-		if (size < (1 << mrap->mra_floor_shift)) {
-			METASLABSTAT_BUMP(
-			    metaslabstat_df_find_under_floor);
-		}
-#endif
 		rs = metaslab_block_find(&msp->ms_allocatable_by_size,
 		    rt, msp->ms_start, size, &where);
 	}
@@ -4404,7 +4409,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva)
  * Metaslab allocation tracing facility
  * ==========================================================================
  */
-#ifdef _METASLAB_TRACING
 
 /*
  * Add an allocation trace element to the allocation tracing list.
@@ -4479,21 +4483,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
 	list_destroy(&zal->zal_list);
 	zal->zal_size = 0;
 }
-#else
-
-#define	metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
-
-void
-metaslab_trace_init(zio_alloc_list_t *zal)
-{
-}
-
-void
-metaslab_trace_fini(zio_alloc_list_t *zal)
-{
-}
-
-#endif /* _METASLAB_TRACING */
 
 /*
  * ==========================================================================
@@ -4634,8 +4623,16 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
 	if (msp == NULL)
 		msp = avl_nearest(t, idx, AVL_AFTER);
 
+	int tries = 0;
 	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
 		int i;
+
+		if (!try_hard && tries > zfs_metaslab_find_max_tries) {
+			METASLABSTAT_BUMP(metaslabstat_too_many_tries);
+			return (NULL);
+		}
+		tries++;
+
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
@@ -5287,9 +5284,12 @@ next:
 	} while ((mg = mg->mg_next) != rotor);
 
 	/*
-	 * If we haven't tried hard, do so now.
+	 * If we haven't tried hard, perhaps do so now.
 	 */
-	if (!try_hard) {
+	if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
+	    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
+	    psize <= 1 << spa->spa_min_ashift)) {
+		METASLABSTAT_BUMP(metaslabstat_try_hard);
 		try_hard = B_TRUE;
 		goto top;
 	}
@@ -6245,3 +6245,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG,
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW,
 	"Percentage of memory that can be used to store metaslab range trees");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
+	ZMOD_RW, "Try hard to allocate before ganging");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW,
+	"Normally only consider this many of the best metaslabs in each vdev");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index ba438353a..3c2b731f7 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3585,17 +3585,16 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	 * of, so we just hash the objset ID to pick the allocator to get
 	 * some parallelism.
 	 */
-	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
-	    txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL,
-	    cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
-	    spa->spa_alloc_count);
+	int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
+	int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
+	    spa->spa_alloc_count;
+	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp,
+	    1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
 	if (error == 0) {
 		*slog = TRUE;
 	} else {
-		error = metaslab_alloc(spa, spa_normal_class(spa), size,
-		    new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
-		    &io_alloc_list, NULL, cityhash4(0, 0, 0,
-		    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count);
+		error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp,
+		    1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
 		if (error == 0)
 			*slog = FALSE;
 	}
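
For context on the first tunable, the fallback ladder that the zfs_metaslab_try_hard_before_gang comment describes condenses into a short compilable sketch. This is a simplified model, not the shipped zio_dva_allocate()/zio_write_gang_block() path: try_alloc() and gang_alloc() are hypothetical stand-ins for metaslab_alloc() and the gang-block machinery, and the forced try-hard cases from the last metaslab.c hunk (gang, ZIL, and minimum-ashift-sized allocations) are folded out for brevity.

	#include <stdbool.h>
	#include <stdio.h>

	static bool zfs_metaslab_try_hard_before_gang = false;

	/* Hypothetical stand-ins; each returns true if the allocation succeeded. */
	static bool try_alloc(bool try_hard) { (void) try_hard; return (false); }
	static bool gang_alloc(bool try_hard) { (void) try_hard; return (false); }

	/*
	 * Order of attempts for one write.  Falling all the way through
	 * means a multi-layer gang block, the most expensive outcome.
	 */
	static const char *
	allocate(void)
	{
		if (try_alloc(false))		/* normal allocation */
			return ("normal");
		if (zfs_metaslab_try_hard_before_gang && try_alloc(true))
			return ("try hard");	/* tunable set: before ganging */
		if (gang_alloc(false))		/* gang allocation */
			return ("gang");
		if (gang_alloc(true))		/* "try hard" gang allocation */
			return ("try-hard gang");
		return ("multi-layer gang");
	}

	int
	main(void)
	{
		printf("result: %s\n", allocate());
		return (0);
	}

Both tunables are ordinary module parameters, so they can be flipped at runtime (on Linux, under /sys/module/zfs/parameters/) while watching the new try_hard and too_many_tries counters to compare gang-block rates.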
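
The find_valid_metaslab() cutoff is easier to see in isolation. A minimal sketch, assuming a weight-sorted array stands in for the group's AVL tree and should_allocate() stands in for metaslab_should_allocate():

	#include <stddef.h>
	#include <stdint.h>

	typedef struct metaslab {
		uint64_t ms_max_size;	/* largest known free segment */
	} metaslab_t;

	static int zfs_metaslab_find_max_tries = 100;

	/* Stand-in for metaslab_should_allocate(). */
	static int
	should_allocate(const metaslab_t *msp, uint64_t asize)
	{
		return (msp->ms_max_size >= asize);
	}

	/*
	 * Walk candidates from best weight to worst.  When not trying hard,
	 * give up after zfs_metaslab_find_max_tries candidates; the real
	 * code bumps the too_many_tries kstat here, and the caller either
	 * gangs or escalates to try_hard, which scans without this cap.
	 */
	static metaslab_t *
	find_valid(metaslab_t *by_weight, size_t count, uint64_t asize,
	    int try_hard)
	{
		int tries = 0;
		for (size_t i = 0; i < count; i++) {
			if (!try_hard && tries > zfs_metaslab_find_max_tries)
				return (NULL);
			tries++;
			if (should_allocate(&by_weight[i], asize))
				return (&by_weight[i]);
		}
		return (NULL);
	}

Note that the cap returns NULL rather than continuing: since unloaded metaslabs have estimated weights and segment sizes are bucketized, a later, lower-weighted metaslab might still fit, but finding it would cost exactly the full scan this change avoids, so that case is deferred to the try-hard pass.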
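
Finally, the rewritten escalation test in the last metaslab.c hunk reads naturally as a predicate. In the standalone restatement below, GANG_ALLOCATION() and the two gang flag values are reproduced from metaslab_impl.h/metaslab.h from memory (only METASLAB_ZIL is defined by this patch), and flags/psize/min_ashift are lifted out of their structs:

	#include <stdint.h>

	typedef int boolean_t;
	#define	B_FALSE			0

	#define	METASLAB_GANG_HEADER	0x2	/* assumed values; see metaslab.h */
	#define	METASLAB_GANG_CHILD	0x4
	#define	METASLAB_ZIL		0x80	/* added by this patch */
	#define	GANG_ALLOCATION(flags) \
		((flags) & (METASLAB_GANG_HEADER | METASLAB_GANG_CHILD))

	static int zfs_metaslab_try_hard_before_gang = B_FALSE;

	/*
	 * Retry with try_hard only where giving up would be expensive or
	 * fatal: a failed gang child means a multi-layer gang block, ZIL
	 * blocks cannot gang at all, and an allocation of at most
	 * 1 << spa_min_ashift cannot be split into smaller gang leaves.
	 * The tunable forces the retry for every allocation.
	 */
	static boolean_t
	should_try_hard(int flags, uint64_t psize, uint64_t spa_min_ashift)
	{
		return (zfs_metaslab_try_hard_before_gang ||
		    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
		    psize <= ((uint64_t)1 << spa_min_ashift));
	}

This is also why zio.c now tags ZIL allocations with METASLAB_ZIL: a ZIL write that fails over to the normal class still has no gang fallback, so it must be allowed to try hard.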