Keep more metaslabs loaded

With the other metaslab changes loaded onto a system, we can 
significantly reduce the memory usage of each loaded metaslab and 
unload them on demand if there is memory pressure. However, none 
of those changes actually result in us keeping more metaslabs loaded. 
If we don't keep more metaslabs loaded, we will still have to wait 
for demand-loading to finish when no loaded metaslab can satisfy our 
allocation, which can cause ZIL performance issues. In addition,
performance is traditionally measured by IOs per unit time, while 
unloading is currently done on a txg-count basis. The duration of a txg
varies widely, from tenths of a second to several seconds. This can
result in confusing, hard-to-predict behavior.

This change simply adds a time-based component to metaslab unloading. 
A metaslab will remain loaded for ten minutes and 32 txgs (by default)
after it was last used, unless it is evicted due to memory pressure.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
External-issue: DLPX-65016
External-issue: DLPX-65047
Closes #9197
This commit is contained in:
Paul Dagnelie 2019-08-29 10:20:36 -07:00 committed by Brian Behlendorf
parent e6cebbf86e
commit eef0f4d84e
3 changed files with 69 additions and 31 deletions

View File

@ -489,6 +489,7 @@ struct metaslab {
*/ */
hrtime_t ms_load_time; /* time last loaded */ hrtime_t ms_load_time; /* time last loaded */
hrtime_t ms_unload_time; /* time last unloaded */ hrtime_t ms_unload_time; /* time last unloaded */
hrtime_t ms_selected_time; /* time last allocated from */
uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
uint64_t ms_max_size; /* maximum allocatable size */ uint64_t ms_max_size; /* maximum allocatable size */

View File

@ -398,7 +398,7 @@ the least recently used metaslab to prevent the system from clogging all of
its memory with range trees. This tunable sets the percentage of total system its memory with range trees. This tunable sets the percentage of total system
memory that is the threshold. memory that is the threshold.
.sp .sp
Default value: \fB75 percent\fR Default value: \fB25 percent\fR
.RE .RE
.sp .sp
@ -469,6 +469,34 @@ angular velocity disk drive.
Use \fB1\fR for yes (default) and \fB0\fR for no. Use \fB1\fR for yes (default) and \fB0\fR for no.
.RE .RE
.sp
.ne 2
.na
\fBmetaslab_unload_delay\fR (int)
.ad
.RS 12n
After a metaslab is used, we keep it loaded for this many txgs, to attempt to
reduce unnecessary reloading. Note that both this many txgs and
\fBmetaslab_unload_delay_ms\fR milliseconds must pass before unloading will
occur.
.sp
Default value: \fB32\fR.
.RE
.sp
.ne 2
.na
\fBmetaslab_unload_delay_ms\fR (int)
.ad
.RS 12n
After a metaslab is used, we keep it loaded for this many milliseconds, to
attempt to reduce unnecessary reloading. Note that both this many
milliseconds and \fBmetaslab_unload_delay\fR txgs must pass before unloading
will occur.
.sp
Default value: \fB600000\fR (ten minutes).
.RE
.sp .sp
.ne 2 .ne 2
.na .na

View File

@ -198,16 +198,20 @@ int metaslab_df_use_largest_segment = B_FALSE;
int metaslab_load_pct = 50; int metaslab_load_pct = 50;
/* /*
* Determines how many txgs a metaslab may remain loaded without having any * These tunables control how long a metaslab will remain loaded after the
* allocations from it. As long as a metaslab continues to be used we will * last allocation from it. A metaslab can't be unloaded until at least
* keep it loaded. * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
* have elapsed. However, zfs_metaslab_mem_limit may cause it to be
* unloaded sooner. These settings are intended to be generous -- to keep
* metaslabs loaded for a long time, reducing the rate of metaslab loading.
*/ */
int metaslab_unload_delay = TXG_SIZE * 2; int metaslab_unload_delay = 32;
int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
/* /*
* Max number of metaslabs per group to preload. * Max number of metaslabs per group to preload.
*/ */
int metaslab_preload_limit = SPA_DVAS_PER_BP; int metaslab_preload_limit = 10;
/* /*
* Enable/disable preloading of metaslab. * Enable/disable preloading of metaslab.
@ -272,18 +276,18 @@ uint64_t metaslab_trace_max_entries = 5000;
*/ */
int max_disabled_ms = 3; int max_disabled_ms = 3;
/*
* Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
* To avoid 64-bit overflow, don't set above UINT32_MAX.
*/
unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
/* /*
* Maximum percentage of memory to use on storing loaded metaslabs. If loading * Maximum percentage of memory to use on storing loaded metaslabs. If loading
* a metaslab would take it over this percentage, the oldest selected metaslab * a metaslab would take it over this percentage, the oldest selected metaslab
* is automatically unloaded. * is automatically unloaded.
*/ */
int zfs_metaslab_mem_limit = 75; int zfs_metaslab_mem_limit = 25;
/*
* Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
* To avoid 64-bit overflow, don't set above UINT32_MAX.
*/
unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
static uint64_t metaslab_weight(metaslab_t *); static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *);
@ -539,15 +543,6 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
multilist_sublist_unlock(mls); multilist_sublist_unlock(mls);
while (msp != NULL) { while (msp != NULL) {
mutex_enter(&msp->ms_lock); mutex_enter(&msp->ms_lock);
/*
* Once we've hit a metaslab selected too recently to
* evict, we're done evicting for now.
*/
if (msp->ms_selected_txg + metaslab_unload_delay >=
txg) {
mutex_exit(&msp->ms_lock);
break;
}
/* /*
* If the metaslab has been removed from the list * If the metaslab has been removed from the list
@ -563,7 +558,20 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
mls = multilist_sublist_lock(ml, i); mls = multilist_sublist_lock(ml, i);
metaslab_t *next_msp = multilist_sublist_next(mls, msp); metaslab_t *next_msp = multilist_sublist_next(mls, msp);
multilist_sublist_unlock(mls); multilist_sublist_unlock(mls);
metaslab_evict(msp, txg); if (txg >
msp->ms_selected_txg + metaslab_unload_delay &&
gethrtime() > msp->ms_selected_time +
(uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
metaslab_evict(msp, txg);
} else {
/*
* Once we've hit a metaslab selected too
* recently to evict, we're done evicting for
* now.
*/
mutex_exit(&msp->ms_lock);
break;
}
mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_lock);
msp = next_msp; msp = next_msp;
} }
@ -2248,6 +2256,7 @@ metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
if (multilist_link_active(&msp->ms_class_txg_node)) if (multilist_link_active(&msp->ms_class_txg_node))
multilist_sublist_remove(mls, msp); multilist_sublist_remove(mls, msp);
msp->ms_selected_txg = txg; msp->ms_selected_txg = txg;
msp->ms_selected_time = gethrtime();
multilist_sublist_insert_tail(mls, msp); multilist_sublist_insert_tail(mls, msp);
multilist_sublist_unlock(mls); multilist_sublist_unlock(mls);
} }
@ -2573,7 +2582,6 @@ metaslab_space_weight(metaslab_t *msp)
uint64_t weight, space; uint64_t weight, space;
ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(!vd->vdev_removing);
/* /*
* The baseline weight is the metaslab's free space. * The baseline weight is the metaslab's free space.
@ -2832,13 +2840,6 @@ metaslab_weight(metaslab_t *msp)
ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&msp->ms_lock));
/*
* If this vdev is in the process of being removed, there is nothing
* for us to do here.
*/
if (vd->vdev_removing)
return (0);
metaslab_set_fragmentation(msp); metaslab_set_fragmentation(msp);
/* /*
@ -5869,6 +5870,14 @@ module_param(metaslab_preload_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_preload_enabled, MODULE_PARM_DESC(metaslab_preload_enabled,
"preload potential metaslabs during reassessment"); "preload potential metaslabs during reassessment");
module_param(metaslab_unload_delay, int, 0644);
MODULE_PARM_DESC(metaslab_unload_delay,
"delay in txgs after metaslab was last used before unloading");
module_param(metaslab_unload_delay_ms, int, 0644);
MODULE_PARM_DESC(metaslab_unload_delay_ms,
"delay in milliseconds after metaslab was last used before unloading");
module_param(zfs_mg_noalloc_threshold, int, 0644); module_param(zfs_mg_noalloc_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_noalloc_threshold, MODULE_PARM_DESC(zfs_mg_noalloc_threshold,
"percentage of free space for metaslab group to allow allocation"); "percentage of free space for metaslab group to allow allocation");