diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 1417ab5f2..088ef3c05 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. @@ -955,7 +955,7 @@ dump_metaslab_stats(metaslab_t *msp) /* max sure nicenum has enough space */ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); - zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf)); + zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 973f15d75..7dd5fe2b5 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -66,7 +66,7 @@ uint64_t metaslab_allocated_space(metaslab_t *); void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); -uint64_t metaslab_block_maxsize(metaslab_t *); +uint64_t metaslab_largest_allocatable(metaslab_t *); /* * metaslab alloc flags diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 29bc8cd5e..08ee8d279 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -475,6 +475,12 @@ struct metaslab { * stay cached. */ uint64_t ms_selected_txg; + /* + * ms_load/unload_time can be used for performance monitoring + * (e.g. by dtrace or mdb). + */ + hrtime_t ms_load_time; /* time last loaded */ + hrtime_t ms_unload_time; /* time last unloaded */ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ @@ -495,6 +501,7 @@ struct metaslab { * segment sizes. */ avl_tree_t ms_allocatable_by_size; + avl_tree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index fce1df68d..e299613b8 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -89,6 +89,8 @@ range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize); void range_tree_verify_not_present(range_tree_t *rt, uint64_t start, uint64_t size); range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 3b88d9748..6c15f11b7 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -370,6 +370,22 @@ larger). Use \fB1\fR for yes and \fB0\fR for no (default). .RE +.sp +.ne 2 +.na +\fBzfs_metaslab_max_size_cache_sec\fR (ulong) +.ad +.RS 12n +When we unload a metaslab, we cache the size of the largest free chunk. We use +that cached size to determine whether or not to load a metaslab for a given +allocation. As more frees accumulate in that metaslab while it's unloaded, the +cached max size becomes less and less accurate. After a number of seconds +controlled by this tunable, we stop considering the cached max size and start +considering only the histogram instead. +.sp +Default value: \fB3600 seconds\fR (one hour) +.RE + .sp .ne 2 .na diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 02b913780..9a9a5e0cf 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -272,6 +272,12 @@ uint64_t metaslab_trace_max_entries = 5000; */ int max_disabled_ms = 3; +/* + * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. + * To avoid 64-bit overflow, don't set above UINT32_MAX. + */ +unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ + static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); @@ -1165,17 +1171,83 @@ metaslab_rangesize_compare(const void *x1, const void *x2) * Return the maximum contiguous segment within the metaslab. */ uint64_t -metaslab_block_maxsize(metaslab_t *msp) +metaslab_largest_allocatable(metaslab_t *msp) { avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; - if (t == NULL || (rs = avl_last(t)) == NULL) - return (0ULL); + if (t == NULL) + return (0); + rs = avl_last(t); + if (rs == NULL) + return (0); return (rs->rs_end - rs->rs_start); } +/* + * Return the maximum contiguous segment within the unflushed frees of this + * metaslab. + */ +uint64_t +metaslab_largest_unflushed_free(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if (msp->ms_unflushed_frees == NULL) + return (0); + + range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size); + if (rs == NULL) + return (0); + + /* + * When a range is freed from the metaslab, that range is added to + * both the unflushed frees and the deferred frees. While the block + * will eventually be usable, if the metaslab were loaded the range + * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE + * txgs had passed. As a result, when attempting to estimate an upper + * bound for the largest currently-usable free segment in the + * metaslab, we need to not consider any ranges currently in the defer + * trees. This algorithm approximates the largest available chunk in + * the largest range in the unflushed_frees tree by taking the first + * chunk. While this may be a poor estimate, it should only remain so + * briefly and should eventually self-correct as frees are no longer + * deferred. Similar logic applies to the ms_freed tree. See + * metaslab_load() for more details. + * + * There are two primary sources of innacuracy in this estimate. Both + * are tolerated for performance reasons. The first source is that we + * only check the largest segment for overlaps. Smaller segments may + * have more favorable overlaps with the other trees, resulting in + * larger usable chunks. Second, we only look at the first chunk in + * the largest segment; there may be other usable chunks in the + * largest segment, but we ignore them. + */ + uint64_t rstart = rs->rs_start; + uint64_t rsize = rs->rs_end - rstart; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, + rsize, &start, &size); + if (found) { + if (rstart == start) + return (0); + rsize = start - rstart; + } + } + + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_freed, rstart, + rsize, &start, &size); + if (found) + rsize = start - rstart; + + return (rsize); +} + static range_seg_t * metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) { @@ -1269,7 +1341,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) * If we're running low on space, find a segment based on size, * rather than iterating based on offset. */ - if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold || + if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { offset = -1; } else { @@ -1375,7 +1447,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_block_maxsize(msp); + uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, @@ -1693,7 +1765,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) msp->ms_weight = 0; msp->ms_fragmentation = 0; - msp->ms_max_size = 0; /* * This function is used for verification purposes. Regardless of @@ -1883,18 +1954,21 @@ metaslab_load_impl(metaslab_t *msp) * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; + uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); - msp->ms_max_size = metaslab_block_maxsize(msp); - + msp->ms_max_size = metaslab_largest_allocatable(msp); + ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); + msp->ms_load_time = load_end; if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " "unflushed_allocs %llu, unflushed_frees %llu, " "freed %llu, defer %llu + %llu, " - "loading_time %lld ms", + "loading_time %lld ms, ms_max_size %llu, " + "max size error %llu", spa_syncing_txg(spa), spa_name(spa), msp->ms_group->mg_vd->vdev_id, msp->ms_id, space_map_length(msp->ms_sm), @@ -1903,7 +1977,8 @@ metaslab_load_impl(metaslab_t *msp) range_tree_space(msp->ms_freed), range_tree_space(msp->ms_defer[0]), range_tree_space(msp->ms_defer[1]), - (longlong_t)((load_end - load_start) / 1000000)); + (longlong_t)((load_end - load_start) / 1000000), + msp->ms_max_size, msp->ms_max_size - max_size); } metaslab_verify_space(msp, spa_syncing_txg(spa)); @@ -1967,10 +2042,10 @@ metaslab_unload(metaslab_t *msp) range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; + msp->ms_unload_time = gethrtime(); msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; - msp->ms_max_size = 0; /* * We explicitly recalculate the metaslab's weight based on its space @@ -2527,13 +2602,19 @@ metaslab_segment_weight(metaslab_t *msp) * weights we rely on the entire weight (excluding the weight-type bit). */ boolean_t -metaslab_should_allocate(metaslab_t *msp, uint64_t asize) +metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { - if (msp->ms_loaded) { + /* + * If the metaslab is loaded, ms_max_size is definitive and we can use + * the fast check. If it's not, the ms_max_size is a lower bound (once + * set), and we should use the fast check as long as we're not in + * try_hard and it's been less than zfs_metaslab_max_size_cache_sec + * seconds since the metaslab was unloaded. + */ + if (msp->ms_loaded || + (msp->ms_max_size != 0 && !try_hard && gethrtime() < + msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); - } else { - ASSERT0(msp->ms_max_size); - } boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { @@ -2571,14 +2652,21 @@ metaslab_weight(metaslab_t *msp) metaslab_set_fragmentation(msp); /* - * Update the maximum size if the metaslab is loaded. This will + * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space - * has been added back into the free tree. + * has been added back into the free tree. If the metaslab is + * unloaded, we check if there's a larger free segment in the + * unflushed frees. This is a lower bound on the largest allocatable + * segment size. Coalescing of adjacent entries may reveal larger + * allocatable segments, but we aren't aware of those until loading + * the space map into a range tree. */ - if (msp->ms_loaded) - msp->ms_max_size = metaslab_block_maxsize(msp); - else - ASSERT0(msp->ms_max_size); + if (msp->ms_loaded) { + msp->ms_max_size = metaslab_largest_allocatable(msp); + } else { + msp->ms_max_size = MAX(msp->ms_max_size, + metaslab_largest_unflushed_free(msp)); + } /* * Segment-based weighting requires space map histogram support. @@ -3595,7 +3683,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create(NULL, NULL); + msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops, + &msp->ms_unflushed_frees_by_size, + metaslab_rangesize_compare, 0); metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } @@ -3992,7 +4082,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. */ - msp->ms_max_size = metaslab_block_maxsize(msp); + msp->ms_max_size = metaslab_largest_allocatable(msp); return (start); } @@ -4010,7 +4100,8 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, - zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) + boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, + boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -4020,7 +4111,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; @@ -4100,8 +4191,8 @@ metaslab_active_mask_verify(metaslab_t *msp) /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; @@ -4174,8 +4265,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, was_active = B_TRUE; } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - want_unique, asize, allocator, zal, search, - &was_active); + want_unique, asize, allocator, try_hard, zal, + search, &was_active); } mutex_exit(&mg->mg_lock); @@ -4282,7 +4373,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * can accurately determine if the allocation attempt should * proceed. */ - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -4360,7 +4451,7 @@ next: */ uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - weight = metaslab_block_maxsize(msp); + weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); } else { weight = metaslab_weight_from_range_tree(msp); @@ -4392,7 +4483,7 @@ next: * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize)); + ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } @@ -4403,14 +4494,14 @@ next: static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator); + dva, d, allocator, try_hard); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { @@ -4592,7 +4683,7 @@ top: * allow any metaslab to be used (unique=false). */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator); + !try_hard, dva, d, allocator, try_hard); if (offset != -1ULL) { /* @@ -5615,6 +5706,10 @@ MODULE_PARM_DESC(metaslab_df_max_search, module_param(metaslab_df_use_largest_segment, int, 0644); MODULE_PARM_DESC(metaslab_df_use_largest_segment, "when looking in size tree, use largest segment instead of exact fit"); + +module_param(zfs_metaslab_max_size_cache_sec, ulong, 0644); +MODULE_PARM_DESC(zfs_metaslab_max_size_cache_sec, + "how long to trust the cached max chunk size of a metaslab"); /* END CSTYLED */ #endif diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 5919236d9..0e1297214 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -524,6 +524,36 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) return (range_tree_find(rt, start, size) != NULL); } +/* + * Returns the first subset of the given range which overlaps with the range + * tree. Returns true if there is a segment in the range, and false if there + * isn't. + */ +boolean_t +range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize) +{ + range_seg_t rsearch; + rsearch.rs_start = start; + rsearch.rs_end = start + 1; + + avl_index_t where; + range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where); + if (rs != NULL) { + *ostart = start; + *osize = MIN(size, rs->rs_end - start); + return (B_TRUE); + } + + rs = avl_nearest(&rt->rt_root, where, AVL_AFTER); + if (rs == NULL || rs->rs_start > start + size) + return (B_FALSE); + + *ostart = rs->rs_start; + *osize = MIN(start + size, rs->rs_end) - rs->rs_start; + return (B_TRUE); +} + /* * Ensure that this range is not in the tree, regardless of whether * it is currently in the tree.