mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-26 20:22:14 +03:00
Improve log spacemap load time
The previous flushing algorithm limited only the total number of log blocks to the minimum of 256K and 4x the number of metaslabs in the pool. As a result, a system with 1500 disks with 1000 metaslabs each, touching several new metaslabs each TXG, could grow the spacemap log to a huge size without much benefit. We've observed one such system taking about 45 minutes to import its pool.

This patch improves the situation from five sides:
- By limiting the maximum period for each metaslab to be flushed to 1000 TXGs, which effectively limits the maximum number of per-TXG spacemap logs to load to the same number.
- By making flushing smoother via accounting for the number of metaslabs that were touched after the last flush and actually need another flush, not just an ms_unflushed_txg bump.
- By applying zfs_unflushed_log_block_pct to the number of metaslabs that were touched after the last flush, not to all metaslabs in the pool.
- By aggressively prefetching per-TXG spacemap logs up to 16 TXGs in advance, making the log spacemap load process for a wide HDD pool CPU-bound and accelerating it many times over.
- By reducing zfs_unflushed_log_block_max from 256K to 128K, reducing the inherently single-threaded log processing time from ~10 to ~5 minutes.

As a further optimization we could skip bumping ms_unflushed_txg for metaslabs not touched since the last flush, but that would be an incompatible change requiring a new pool feature.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12789
This commit is contained in:
+76
-56
@@ -2750,7 +2750,8 @@ metaslab_fini_flush_data(metaslab_t *msp)
|
||||
mutex_exit(&spa->spa_flushed_ms_lock);
|
||||
|
||||
spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
|
||||
spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
|
||||
spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
|
||||
metaslab_unflushed_dirty(msp));
|
||||
}
|
||||
|
||||
uint64_t
|
||||
@@ -3728,6 +3729,61 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
|
||||
metaslab_flush_update(msp, tx);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
|
||||
{
|
||||
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
|
||||
ASSERT(spa_syncing_log_sm(spa) != NULL);
|
||||
ASSERT(msp->ms_sm != NULL);
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
|
||||
|
||||
mutex_enter(&spa->spa_flushed_ms_lock);
|
||||
metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
|
||||
metaslab_set_unflushed_dirty(msp, B_TRUE);
|
||||
avl_add(&spa->spa_metaslabs_by_flushed, msp);
|
||||
mutex_exit(&spa->spa_flushed_ms_lock);
|
||||
|
||||
spa_log_sm_increment_current_mscount(spa);
|
||||
spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
|
||||
}
|
||||
|
||||
void
|
||||
metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
|
||||
{
|
||||
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
|
||||
ASSERT(spa_syncing_log_sm(spa) != NULL);
|
||||
ASSERT(msp->ms_sm != NULL);
|
||||
ASSERT(metaslab_unflushed_txg(msp) != 0);
|
||||
ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
|
||||
|
||||
VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
|
||||
|
||||
/* update metaslab's position in our flushing tree */
|
||||
uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
|
||||
boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
|
||||
mutex_enter(&spa->spa_flushed_ms_lock);
|
||||
avl_remove(&spa->spa_metaslabs_by_flushed, msp);
|
||||
metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
|
||||
metaslab_set_unflushed_dirty(msp, dirty);
|
||||
avl_add(&spa->spa_metaslabs_by_flushed, msp);
|
||||
mutex_exit(&spa->spa_flushed_ms_lock);
|
||||
|
||||
/* update metaslab counts of spa_log_sm_t nodes */
|
||||
spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
|
||||
spa_log_sm_increment_current_mscount(spa);
|
||||
|
||||
/* update log space map summary */
|
||||
spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
|
||||
ms_prev_flushed_dirty);
|
||||
spa_log_summary_add_flushed_metaslab(spa, dirty);
|
||||
|
||||
/* cleanup obsolete logs if any */
|
||||
spa_cleanup_old_sm_logs(spa, tx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called when the metaslab has been flushed (its own spacemap now reflects
|
||||
* all the contents of the pool-wide spacemap log). Updates the metaslab's
|
||||
@@ -3743,8 +3799,6 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
|
||||
ASSERT3U(spa_sync_pass(spa), ==, 1);
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
|
||||
|
||||
/*
|
||||
* Just because a metaslab got flushed, that doesn't mean that
|
||||
@@ -3757,39 +3811,11 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
|
||||
* We may end up here from metaslab_condense() without the
|
||||
* feature being active. In that case this is a no-op.
|
||||
*/
|
||||
if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
|
||||
if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
|
||||
metaslab_unflushed_txg(msp) == 0)
|
||||
return;
|
||||
|
||||
ASSERT(spa_syncing_log_sm(spa) != NULL);
|
||||
ASSERT(msp->ms_sm != NULL);
|
||||
ASSERT(metaslab_unflushed_txg(msp) != 0);
|
||||
ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
|
||||
|
||||
VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
|
||||
|
||||
/* update metaslab's position in our flushing tree */
|
||||
uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
|
||||
mutex_enter(&spa->spa_flushed_ms_lock);
|
||||
avl_remove(&spa->spa_metaslabs_by_flushed, msp);
|
||||
metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
|
||||
avl_add(&spa->spa_metaslabs_by_flushed, msp);
|
||||
mutex_exit(&spa->spa_flushed_ms_lock);
|
||||
|
||||
/* update metaslab counts of spa_log_sm_t nodes */
|
||||
spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
|
||||
spa_log_sm_increment_current_mscount(spa);
|
||||
|
||||
/* cleanup obsolete logs if any */
|
||||
uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
|
||||
spa_cleanup_old_sm_logs(spa, tx);
|
||||
uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
|
||||
VERIFY3U(log_blocks_after, <=, log_blocks_before);
|
||||
|
||||
/* update log space map summary */
|
||||
uint64_t blocks_gone = log_blocks_before - log_blocks_after;
|
||||
spa_log_summary_add_flushed_metaslab(spa);
|
||||
spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
|
||||
spa_log_summary_decrement_blkcount(spa, blocks_gone);
|
||||
metaslab_unflushed_bump(msp, tx, B_FALSE);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
@@ -4005,23 +4031,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||
ASSERT0(metaslab_allocated_space(msp));
|
||||
}
|
||||
|
||||
if (metaslab_unflushed_txg(msp) == 0 &&
|
||||
spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
|
||||
ASSERT(spa_syncing_log_sm(spa) != NULL);
|
||||
|
||||
metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
|
||||
spa_log_sm_increment_current_mscount(spa);
|
||||
spa_log_summary_add_flushed_metaslab(spa);
|
||||
|
||||
ASSERT(msp->ms_sm != NULL);
|
||||
mutex_enter(&spa->spa_flushed_ms_lock);
|
||||
avl_add(&spa->spa_metaslabs_by_flushed, msp);
|
||||
mutex_exit(&spa->spa_flushed_ms_lock);
|
||||
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
|
||||
}
|
||||
|
||||
if (!range_tree_is_empty(msp->ms_checkpointing) &&
|
||||
vd->vdev_checkpoint_sm == NULL) {
|
||||
ASSERT(spa_has_checkpoint(spa));
|
||||
@@ -4069,6 +4078,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||
space_map_t *log_sm = spa_syncing_log_sm(spa);
|
||||
if (log_sm != NULL) {
|
||||
ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
|
||||
if (metaslab_unflushed_txg(msp) == 0)
|
||||
metaslab_unflushed_add(msp, tx);
|
||||
else if (!metaslab_unflushed_dirty(msp))
|
||||
metaslab_unflushed_bump(msp, tx, B_TRUE);
|
||||
|
||||
space_map_write(log_sm, alloctree, SM_ALLOC,
|
||||
vd->vdev_id, tx);
|
||||
@@ -6131,6 +6144,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
|
||||
mutex_exit(&mg->mg_ms_disabled_lock);
|
||||
}
|
||||
|
||||
void
|
||||
metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
|
||||
{
|
||||
ms->ms_unflushed_dirty = dirty;
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
|
||||
{
|
||||
@@ -6167,15 +6186,16 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
|
||||
void
|
||||
metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
|
||||
{
|
||||
spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
|
||||
|
||||
if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
|
||||
return;
|
||||
|
||||
ms->ms_unflushed_txg = txg;
|
||||
metaslab_update_ondisk_flush_data(ms, tx);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
metaslab_unflushed_dirty(metaslab_t *ms)
|
||||
{
|
||||
return (ms->ms_unflushed_dirty);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
metaslab_unflushed_txg(metaslab_t *ms)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user