From 893a6d62c1895f3e3eeb660b048236571995a564 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie
Date: Thu, 6 Jun 2019 19:10:43 -0700
Subject: [PATCH] Allow metaslab to be unloaded even when not freed from

On large systems, the memory used by loaded metaslabs can become a
concern. While range trees are a fairly efficient data structure, on
heavily fragmented pools they can still consume a significant amount
of memory. This problem is amplified when we fail to unload metaslabs
that we aren't using. Currently, we only unload a metaslab during
metaslab_sync_done; in order for that function to be called on a given
metaslab in a given txg, we have to have dirtied that metaslab in that
txg. If the dirtying was the result of an allocation, we wouldn't be
unloading it (since it wouldn't be 8 txgs since it was selected), so in
effect we only unload a metaslab during txgs where it's being freed
from.

We move the unload logic from sync_done to a new function, and call
that function on all metaslabs in a given vdev during vdev_sync_done().

Reviewed-by: Richard Elling
Reviewed-by: Brian Behlendorf
Signed-off-by: Paul Dagnelie
Closes #8837
---
 include/sys/metaslab.h |  1 +
 module/zfs/metaslab.c  | 47 ++++++++++++++++++++++--------------------
 module/zfs/vdev.c      | 14 ++++++++++++++
 3 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 2790d06c7..330902529 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -50,6 +50,7 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
 void metaslab_fini(metaslab_t *);
 
 int metaslab_load(metaslab_t *);
+void metaslab_potentially_unload(metaslab_t *, uint64_t);
 void metaslab_unload(metaslab_t *);
 
 uint64_t metaslab_allocated_space(metaslab_t *);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index d1d5a243f..41cbaad5f 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
@@ -2949,6 +2949,30 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 	dmu_tx_commit(tx);
 }
+void
+metaslab_potentially_unload(metaslab_t *msp, uint64_t txg)
+{
+	/*
+	 * If the metaslab is loaded and we've not tried to load or allocate
+	 * from it in 'metaslab_unload_delay' txgs, then unload it.
+	 */
+	if (msp->ms_loaded &&
+	    msp->ms_disabled == 0 &&
+	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
+		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+			VERIFY0(range_tree_space(
+			    msp->ms_allocating[(txg + t) & TXG_MASK]));
+		}
+		if (msp->ms_allocator != -1) {
+			metaslab_passivate(msp, msp->ms_weight &
+			    ~METASLAB_ACTIVE_MASK);
+		}
+
+		if (!metaslab_debug_unload)
+			metaslab_unload(msp);
+	}
+}
+
 /*
  * Called after a transaction group has completely synced to mark
  * all of the metaslab's free space as usable.
  */
@@ -3086,27 +3110,6 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 
-	/*
-	 * If the metaslab is loaded and we've not tried to load or allocate
-	 * from it in 'metaslab_unload_delay' txgs, then unload it.
-	 */
-	if (msp->ms_loaded &&
-	    msp->ms_disabled == 0 &&
-	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
-
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
-			VERIFY0(range_tree_space(
-			    msp->ms_allocating[(txg + t) & TXG_MASK]));
-		}
-		if (msp->ms_allocator != -1) {
-			metaslab_passivate(msp, msp->ms_weight &
-			    ~METASLAB_ACTIVE_MASK);
-		}
-
-		if (!metaslab_debug_unload)
-			metaslab_unload(msp);
-	}
-
 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_freed));
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 1c4812cd8..81ef87e25 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3234,6 +3234,20 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
 	    != NULL)
 		metaslab_sync_done(msp, txg);
 
+	/*
+	 * Because this function is only called on dirty vdevs, it's possible
+	 * we won't consider all metaslabs for unloading on every
+	 * txg. However, unless the system is largely idle it is likely that
+	 * we will dirty all vdevs within a few txgs.
+	 */
+	for (int i = 0; i < vd->vdev_ms_count; i++) {
+		msp = vd->vdev_ms[i];
+		mutex_enter(&msp->ms_lock);
+		if (msp->ms_sm != NULL)
+			metaslab_potentially_unload(msp, txg);
+		mutex_exit(&msp->ms_lock);
+	}
+
 	if (reassess)
 		metaslab_sync_reassess(vd->vdev_mg);
 }
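
Illustrative note (not part of the patch): the unload decision added above reduces to a
comparison against the txg in which the metaslab was last selected. The standalone C sketch
below models only that gating condition. The ms_state struct, the ms_unload_eligible()
helper, and the hard-coded delay of 8 txgs (the default mentioned in the commit message)
are hypothetical stand-ins for the real metaslab_t fields and the metaslab_unload_delay
tunable, not code from ZFS.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the metaslab_t fields referenced in the diff. */
struct ms_state {
	bool		ms_loaded;
	uint64_t	ms_disabled;
	uint64_t	ms_selected_txg;
};

/* Assumed default of 8 txgs, per the commit message. */
static uint64_t metaslab_unload_delay = 8;

/* Mirrors the gating condition in metaslab_potentially_unload(). */
static bool
ms_unload_eligible(const struct ms_state *ms, uint64_t txg)
{
	return (ms->ms_loaded &&
	    ms->ms_disabled == 0 &&
	    ms->ms_selected_txg + metaslab_unload_delay < txg);
}

int
main(void)
{
	struct ms_state ms = { .ms_loaded = true, .ms_selected_txg = 100 };

	/* Only 5 txgs since selection: not yet eligible, prints 0. */
	printf("txg 105: %d\n", ms_unload_eligible(&ms, 105));
	/* 9 txgs since selection, past the 8-txg delay: eligible, prints 1. */
	printf("txg 109: %d\n", ms_unload_eligible(&ms, 109));
	return (0);
}

In the real code this check now runs from the loop added to vdev_sync_done(), so every
metaslab on a dirty vdev is re-evaluated each txg, rather than only the metaslabs that
were themselves dirtied.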