mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Cap metaslab memory usage
On systems with large amounts of storage and high fragmentation, a huge amount of space can be used by storing metaslab range trees. Since metaslabs are only unloaded during a txg sync, and only if they have been inactive for 8 txgs, it is possible to get into a state where all of the system's memory is consumed by range trees and metaslabs, and txgs cannot sync. While ZFS knows how to evict ARC data when needed, it has no such mechanism for range tree data. This can result in boot hangs for some system configurations.

First, we add the ability to unload metaslabs outside of syncing context. Second, we store a multilist of all loaded metaslabs, sorted by their selection txg, so we can quickly identify the oldest metaslabs. We use a multilist to reduce lock contention during heavy write workloads. Finally, we add logic that will unload a metaslab when we're loading a new metaslab, if we're using more than a certain fraction of the available memory on range trees.

Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #9128
This commit is contained in:
committed by
Brian Behlendorf
parent
9323aad14d
commit
f09fda5071
@@ -291,6 +291,7 @@ void arc_flush(spa_t *spa, boolean_t retry);
|
||||
void arc_tempreserve_clear(uint64_t reserve);
|
||||
int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
|
||||
|
||||
uint64_t arc_all_memory(void);
|
||||
uint64_t arc_target_bytes(void);
|
||||
void arc_init(void);
|
||||
void arc_fini(void);
|
||||
|
||||
@@ -57,7 +57,6 @@ int metaslab_sort_by_flushed(const void *, const void *);
|
||||
uint64_t metaslab_unflushed_changes_memused(metaslab_t *);
|
||||
|
||||
int metaslab_load(metaslab_t *);
|
||||
void metaslab_potentially_unload(metaslab_t *, uint64_t);
|
||||
void metaslab_unload(metaslab_t *);
|
||||
boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *);
|
||||
|
||||
@@ -110,7 +109,7 @@ uint64_t metaslab_class_expandable_space(metaslab_class_t *);
|
||||
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
|
||||
zio_t *, int);
|
||||
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
|
||||
|
||||
void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
|
||||
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
|
||||
uint64_t metaslab_class_get_space(metaslab_class_t *);
|
||||
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
|
||||
@@ -133,7 +132,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
|
||||
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
|
||||
void metaslab_recalculate_weight_and_sort(metaslab_t *);
|
||||
void metaslab_disable(metaslab_t *);
|
||||
void metaslab_enable(metaslab_t *, boolean_t);
|
||||
void metaslab_enable(metaslab_t *, boolean_t, boolean_t);
|
||||
void metaslab_set_selected_txg(metaslab_t *, uint64_t);
|
||||
|
||||
extern int metaslab_debug_load;
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@
|
||||
#include <sys/vdev.h>
|
||||
#include <sys/txg.h>
|
||||
#include <sys/avl.h>
|
||||
#include <sys/multilist.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -194,6 +195,12 @@ struct metaslab_class {
|
||||
uint64_t mc_space; /* total space (alloc + free) */
|
||||
uint64_t mc_dspace; /* total deflated space */
|
||||
uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
|
||||
|
||||
/*
|
||||
* List of all loaded metaslabs in the class, sorted in order of most
|
||||
* recent use.
|
||||
*/
|
||||
multilist_t *mc_metaslab_txg_list;
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -378,6 +385,7 @@ struct metaslab {
|
||||
range_tree_t *ms_allocating[TXG_SIZE];
|
||||
range_tree_t *ms_allocatable;
|
||||
uint64_t ms_allocated_this_txg;
|
||||
uint64_t ms_allocating_total;
|
||||
|
||||
/*
|
||||
* The following range trees are accessed only from syncing context.
|
||||
@@ -508,6 +516,10 @@ struct metaslab {
|
||||
avl_node_t ms_group_node; /* node in metaslab group tree */
|
||||
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
|
||||
avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
|
||||
/*
|
||||
* Node in metaslab class's selected txg list
|
||||
*/
|
||||
multilist_node_t ms_class_txg_node;
|
||||
|
||||
/*
|
||||
* Allocs and frees that are committed to the vdev log spacemap but
|
||||
|
||||
Reference in New Issue
Block a user