mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-25 09:25:00 +03:00 
			
		
		
		
	Initialize metaslab range trees in metaslab_init
= Motivation
We've noticed several zloop crashes within Delphix generated
due to the following sequence of events:
- A device gets expanded and new metaslabas are allocated for
  it. These metaslabs go through `metaslab_init()` but haven't
  gone through `metaslab_sync_done()` yet. This meas that the
  only range tree that's actually set is the `ms_allocatable`.
  All the others are NULL.
- A vdev_initialization is issues and `vdev_initialize_thread`
  starts processing one of these new metaslabs of the expanded
  vdev.
- As part of `vdev_initialize_calculate_progress()` we call
  into `metaslab_load()` and `metaslab_load_impl()` which
  in turn tries to dereference the metaslabs trees that
  are still NULL and therefore we crash.
The same failure can come up from the `vdev_trim` code paths.
= This Patch
We considered the following solutions to deal with this issue:
[A] Add logic to `vdev_initialize/trim` to skip those new
    metaslabs. We decided against this as it would be good
    to avoid exposing this lower-level detail to higer-level
    operations.
[B] Have `metaslab_load_impl()` return early for new metaslabs
    and thus never touch those range_trees that are NULL at
    that time. This seemed more of a work-around for the bug
    and not a clear-cut solution.
[C] Refactor our logic so all metaslabs have their range_trees
    created at the time of their creatin in `metaslab_init()`.
In this patch we decided to go with [C] because:
(1) It doesn't expose more metaslab details to higher level
    operations such as vdev initialize and trim.
(2) The current behavior of creating the range trees lazily
    in `metaslab_sync_done()` is unnecessarily complicated.
(3) Always initializing the metaslab range_trees makes other
    parts of the codebase cleaner. For example, we used to
    use `ms_freed` as the reference value for knowing whether
    all the range_trees have been initialized. Now we no
    longer need to do that check in most places (and in the
    few that we do we use the `ms_new` boolean field now
    which is more readable).
= Side Changes
Probably due to a mismerge we set `ms_loaded` to `B_TRUE` twice
in `metasloab_load_impl()`. In this patch we remove the extraneous
assignment.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #11737
			
			
This commit is contained in:
		
							parent
							
								
									ffd6978ef5
								
							
						
					
					
						commit
						793c958f6f
					
				| @ -2316,18 +2316,13 @@ metaslab_load_impl(metaslab_t *msp) | |||||||
| 		range_tree_add(msp->ms_allocatable, | 		range_tree_add(msp->ms_allocatable, | ||||||
| 		    msp->ms_start, msp->ms_size); | 		    msp->ms_start, msp->ms_size); | ||||||
| 
 | 
 | ||||||
| 		if (msp->ms_freed != NULL) { | 		if (msp->ms_new) { | ||||||
| 			/*
 | 			/*
 | ||||||
| 			 * If the ms_sm doesn't exist, this means that this | 			 * If the ms_sm doesn't exist, this means that this | ||||||
| 			 * metaslab hasn't gone through metaslab_sync() and | 			 * metaslab hasn't gone through metaslab_sync() and | ||||||
| 			 * thus has never been dirtied. So we shouldn't | 			 * thus has never been dirtied. So we shouldn't | ||||||
| 			 * expect any unflushed allocs or frees from previous | 			 * expect any unflushed allocs or frees from previous | ||||||
| 			 * TXGs. | 			 * TXGs. | ||||||
| 			 * |  | ||||||
| 			 * Note: ms_freed and all the other trees except for |  | ||||||
| 			 * the ms_allocatable, can be NULL at this point only |  | ||||||
| 			 * if this is a new metaslab of a vdev that just got |  | ||||||
| 			 * expanded. |  | ||||||
| 			 */ | 			 */ | ||||||
| 			ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); | 			ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); | ||||||
| 			ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); | 			ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); | ||||||
| @ -2365,8 +2360,6 @@ metaslab_load_impl(metaslab_t *msp) | |||||||
| 	range_tree_walk(msp->ms_unflushed_frees, | 	range_tree_walk(msp->ms_unflushed_frees, | ||||||
| 	    range_tree_add, msp->ms_allocatable); | 	    range_tree_add, msp->ms_allocatable); | ||||||
| 
 | 
 | ||||||
| 	msp->ms_loaded = B_TRUE; |  | ||||||
| 
 |  | ||||||
| 	ASSERT3P(msp->ms_group, !=, NULL); | 	ASSERT3P(msp->ms_group, !=, NULL); | ||||||
| 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa; | 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa; | ||||||
| 	if (spa_syncing_log_sm(spa) != NULL) { | 	if (spa_syncing_log_sm(spa) != NULL) { | ||||||
| @ -2680,19 +2673,31 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, | |||||||
| 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm); | 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	range_seg_type_t type; |  | ||||||
| 	uint64_t shift, start; | 	uint64_t shift, start; | ||||||
| 	type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); | 	range_seg_type_t type = | ||||||
|  | 	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift); | ||||||
| 
 | 
 | ||||||
| 	/*
 |  | ||||||
| 	 * We create the ms_allocatable here, but we don't create the |  | ||||||
| 	 * other range trees until metaslab_sync_done().  This serves |  | ||||||
| 	 * two purposes: it allows metaslab_sync_done() to detect the |  | ||||||
| 	 * addition of new space; and for debugging, it ensures that |  | ||||||
| 	 * we'd data fault on any attempt to use this metaslab before |  | ||||||
| 	 * it's ready. |  | ||||||
| 	 */ |  | ||||||
| 	ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); | 	ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); | ||||||
|  | 	for (int t = 0; t < TXG_SIZE; t++) { | ||||||
|  | 		ms->ms_allocating[t] = range_tree_create(NULL, type, | ||||||
|  | 		    NULL, start, shift); | ||||||
|  | 	} | ||||||
|  | 	ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift); | ||||||
|  | 	ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift); | ||||||
|  | 	for (int t = 0; t < TXG_DEFER_SIZE; t++) { | ||||||
|  | 		ms->ms_defer[t] = range_tree_create(NULL, type, NULL, | ||||||
|  | 		    start, shift); | ||||||
|  | 	} | ||||||
|  | 	ms->ms_checkpointing = | ||||||
|  | 	    range_tree_create(NULL, type, NULL, start, shift); | ||||||
|  | 	ms->ms_unflushed_allocs = | ||||||
|  | 	    range_tree_create(NULL, type, NULL, start, shift); | ||||||
|  | 
 | ||||||
|  | 	metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); | ||||||
|  | 	mrap->mra_bt = &ms->ms_unflushed_frees_by_size; | ||||||
|  | 	mrap->mra_floor_shift = metaslab_by_size_min_shift; | ||||||
|  | 	ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, | ||||||
|  | 	    type, mrap, start, shift); | ||||||
| 
 | 
 | ||||||
| 	ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); | 	ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); | ||||||
| 
 | 
 | ||||||
| @ -2765,13 +2770,13 @@ metaslab_fini(metaslab_t *msp) | |||||||
| 
 | 
 | ||||||
| 	mutex_enter(&msp->ms_lock); | 	mutex_enter(&msp->ms_lock); | ||||||
| 	VERIFY(msp->ms_group == NULL); | 	VERIFY(msp->ms_group == NULL); | ||||||
|  | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * If the range trees haven't been allocated, this metaslab hasn't | 	 * If this metaslab hasn't been through metaslab_sync_done() yet its | ||||||
| 	 * been through metaslab_sync_done() for the first time yet, so its |  | ||||||
| 	 * space hasn't been accounted for in its vdev and doesn't need to be | 	 * space hasn't been accounted for in its vdev and doesn't need to be | ||||||
| 	 * subtracted. | 	 * subtracted. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (msp->ms_freed != NULL) { | 	if (!msp->ms_new) { | ||||||
| 		metaslab_space_update(vd, mg->mg_class, | 		metaslab_space_update(vd, mg->mg_class, | ||||||
| 		    -metaslab_allocated_space(msp), 0, -msp->ms_size); | 		    -metaslab_allocated_space(msp), 0, -msp->ms_size); | ||||||
| 
 | 
 | ||||||
| @ -2782,27 +2787,24 @@ metaslab_fini(metaslab_t *msp) | |||||||
| 	metaslab_unload(msp); | 	metaslab_unload(msp); | ||||||
| 
 | 
 | ||||||
| 	range_tree_destroy(msp->ms_allocatable); | 	range_tree_destroy(msp->ms_allocatable); | ||||||
|  | 	range_tree_destroy(msp->ms_freeing); | ||||||
|  | 	range_tree_destroy(msp->ms_freed); | ||||||
| 
 | 
 | ||||||
| 	if (msp->ms_freed != NULL) { | 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, | ||||||
| 		range_tree_destroy(msp->ms_freeing); | 	    metaslab_unflushed_changes_memused(msp)); | ||||||
| 		range_tree_destroy(msp->ms_freed); | 	spa->spa_unflushed_stats.sus_memused -= | ||||||
|  | 	    metaslab_unflushed_changes_memused(msp); | ||||||
|  | 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); | ||||||
|  | 	range_tree_destroy(msp->ms_unflushed_allocs); | ||||||
|  | 	range_tree_destroy(msp->ms_checkpointing); | ||||||
|  | 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); | ||||||
|  | 	range_tree_destroy(msp->ms_unflushed_frees); | ||||||
| 
 | 
 | ||||||
| 		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, | 	for (int t = 0; t < TXG_SIZE; t++) { | ||||||
| 		    metaslab_unflushed_changes_memused(msp)); | 		range_tree_destroy(msp->ms_allocating[t]); | ||||||
| 		spa->spa_unflushed_stats.sus_memused -= | 	} | ||||||
| 		    metaslab_unflushed_changes_memused(msp); | 	for (int t = 0; t < TXG_DEFER_SIZE; t++) { | ||||||
| 		range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); | 		range_tree_destroy(msp->ms_defer[t]); | ||||||
| 		range_tree_destroy(msp->ms_unflushed_allocs); |  | ||||||
| 		range_tree_destroy(msp->ms_checkpointing); |  | ||||||
| 		range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); |  | ||||||
| 		range_tree_destroy(msp->ms_unflushed_frees); |  | ||||||
| 
 |  | ||||||
| 		for (int t = 0; t < TXG_SIZE; t++) { |  | ||||||
| 			range_tree_destroy(msp->ms_allocating[t]); |  | ||||||
| 		} |  | ||||||
| 		for (int t = 0; t < TXG_DEFER_SIZE; t++) { |  | ||||||
| 			range_tree_destroy(msp->ms_defer[t]); |  | ||||||
| 		} |  | ||||||
| 	} | 	} | ||||||
| 	ASSERT0(msp->ms_deferspace); | 	ASSERT0(msp->ms_deferspace); | ||||||
| 
 | 
 | ||||||
| @ -3926,17 +3928,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) | |||||||
| 	/*
 | 	/*
 | ||||||
| 	 * This metaslab has just been added so there's no work to do now. | 	 * This metaslab has just been added so there's no work to do now. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (msp->ms_freeing == NULL) { | 	if (msp->ms_new) { | ||||||
| 		ASSERT3P(alloctree, ==, NULL); | 		ASSERT0(range_tree_space(alloctree)); | ||||||
|  | 		ASSERT0(range_tree_space(msp->ms_freeing)); | ||||||
|  | 		ASSERT0(range_tree_space(msp->ms_freed)); | ||||||
|  | 		ASSERT0(range_tree_space(msp->ms_checkpointing)); | ||||||
|  | 		ASSERT0(range_tree_space(msp->ms_trim)); | ||||||
| 		return; | 		return; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	ASSERT3P(alloctree, !=, NULL); |  | ||||||
| 	ASSERT3P(msp->ms_freeing, !=, NULL); |  | ||||||
| 	ASSERT3P(msp->ms_freed, !=, NULL); |  | ||||||
| 	ASSERT3P(msp->ms_checkpointing, !=, NULL); |  | ||||||
| 	ASSERT3P(msp->ms_trim, !=, NULL); |  | ||||||
| 
 |  | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Normally, we don't want to process a metaslab if there are no | 	 * Normally, we don't want to process a metaslab if there are no | ||||||
| 	 * allocations or frees to perform. However, if the metaslab is being | 	 * allocations or frees to perform. However, if the metaslab is being | ||||||
| @ -4240,54 +4240,15 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) | |||||||
| 
 | 
 | ||||||
| 	mutex_enter(&msp->ms_lock); | 	mutex_enter(&msp->ms_lock); | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	if (msp->ms_new) { | ||||||
| 	 * If this metaslab is just becoming available, initialize its | 		/* this is a new metaslab, add its capacity to the vdev */ | ||||||
| 	 * range trees and add its capacity to the vdev. |  | ||||||
| 	 */ |  | ||||||
| 	if (msp->ms_freed == NULL) { |  | ||||||
| 		range_seg_type_t type; |  | ||||||
| 		uint64_t shift, start; |  | ||||||
| 		type = metaslab_calculate_range_tree_type(vd, msp, &start, |  | ||||||
| 		    &shift); |  | ||||||
| 
 |  | ||||||
| 		for (int t = 0; t < TXG_SIZE; t++) { |  | ||||||
| 			ASSERT(msp->ms_allocating[t] == NULL); |  | ||||||
| 
 |  | ||||||
| 			msp->ms_allocating[t] = range_tree_create(NULL, type, |  | ||||||
| 			    NULL, start, shift); |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		ASSERT3P(msp->ms_freeing, ==, NULL); |  | ||||||
| 		msp->ms_freeing = range_tree_create(NULL, type, NULL, start, |  | ||||||
| 		    shift); |  | ||||||
| 
 |  | ||||||
| 		ASSERT3P(msp->ms_freed, ==, NULL); |  | ||||||
| 		msp->ms_freed = range_tree_create(NULL, type, NULL, start, |  | ||||||
| 		    shift); |  | ||||||
| 
 |  | ||||||
| 		for (int t = 0; t < TXG_DEFER_SIZE; t++) { |  | ||||||
| 			ASSERT3P(msp->ms_defer[t], ==, NULL); |  | ||||||
| 			msp->ms_defer[t] = range_tree_create(NULL, type, NULL, |  | ||||||
| 			    start, shift); |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		ASSERT3P(msp->ms_checkpointing, ==, NULL); |  | ||||||
| 		msp->ms_checkpointing = range_tree_create(NULL, type, NULL, |  | ||||||
| 		    start, shift); |  | ||||||
| 
 |  | ||||||
| 		ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); |  | ||||||
| 		msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, |  | ||||||
| 		    start, shift); |  | ||||||
| 
 |  | ||||||
| 		metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); |  | ||||||
| 		mrap->mra_bt = &msp->ms_unflushed_frees_by_size; |  | ||||||
| 		mrap->mra_floor_shift = metaslab_by_size_min_shift; |  | ||||||
| 		ASSERT3P(msp->ms_unflushed_frees, ==, NULL); |  | ||||||
| 		msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, |  | ||||||
| 		    type, mrap, start, shift); |  | ||||||
| 
 |  | ||||||
| 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); | 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); | ||||||
|  | 
 | ||||||
|  | 		/* there should be no allocations nor frees at this point */ | ||||||
|  | 		VERIFY0(msp->ms_allocated_this_txg); | ||||||
|  | 		VERIFY0(range_tree_space(msp->ms_freed)); | ||||||
| 	} | 	} | ||||||
|  | 
 | ||||||
| 	ASSERT0(range_tree_space(msp->ms_freeing)); | 	ASSERT0(range_tree_space(msp->ms_freeing)); | ||||||
| 	ASSERT0(range_tree_space(msp->ms_checkpointing)); | 	ASSERT0(range_tree_space(msp->ms_checkpointing)); | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Serapheim Dimitropoulos
						Serapheim Dimitropoulos