Make metaslab class rotor and aliquot per-allocator.

The metaslab rotor and aliquot are used to distribute workload between
vdevs while keeping some locality for logically adjacent blocks.  Since
multiple allocators were introduced to separate allocation of different
objects, it does not make much sense for different allocators to write
into different metaslabs of the same metaslab group (vdev) at the same
time, competing for its resources.  This change makes each allocator
choose its metaslab group independently, colliding with others only
sporadically.

A test of simultaneous writes into 4 files with a recordsize of 4KB
on a striped pool of 30 disks on a system with 40 logical cores shows
a reduction of vdev queue lock contention from 54% to 27% due to better
load distribution.  Unfortunately it won't help ZVOLs much yet, since
only one dataset/ZVOL is synced at a time, and so for the most part
only one allocator is used, but it may improve later.

While there, to reduce the number of pointer dereferences, change the
per-allocator storage for metaslab classes and groups from several
separate malloc()'s to flexible array members at the ends of the
original class and group structures.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Closes #11288
This commit is contained in:
Alexander Motin 2020-12-15 13:55:44 -05:00 committed by GitHub
parent e2d952cda0
commit f8020c9363
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 116 additions and 100 deletions

View File

@ -6322,7 +6322,7 @@ dump_block_stats(spa_t *spa)
(void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
if (spa_special_class(spa)->mc_rotor != NULL) { if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) {
uint64_t alloc = metaslab_class_get_alloc( uint64_t alloc = metaslab_class_get_alloc(
spa_special_class(spa)); spa_special_class(spa));
uint64_t space = metaslab_class_get_space( uint64_t space = metaslab_class_get_space(
@ -6333,7 +6333,7 @@ dump_block_stats(spa_t *spa)
100.0 * alloc / space); 100.0 * alloc / space);
} }
if (spa_dedup_class(spa)->mc_rotor != NULL) { if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) {
uint64_t alloc = metaslab_class_get_alloc( uint64_t alloc = metaslab_class_get_alloc(
spa_dedup_class(spa)); spa_dedup_class(spa));
uint64_t space = metaslab_class_get_space( uint64_t space = metaslab_class_get_space(

View File

@ -3099,7 +3099,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
/* /*
* find the first real slog in log allocation class * find the first real slog in log allocation class
*/ */
mg = spa_log_class(spa)->mc_rotor; mg = spa_log_class(spa)->mc_allocator[0].mca_rotor;
while (!mg->mg_vd->vdev_islog) while (!mg->mg_vd->vdev_islog)
mg = mg->mg_next; mg = mg->mg_next;

View File

@ -136,6 +136,29 @@ typedef enum trace_alloc_type {
#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) #define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54)
#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) #define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x)
/*
 * Per-allocator data structure.
 *
 * A metaslab class carries an array of these (mc_allocator[]), indexed by
 * allocator number, so that each allocator keeps its own rotor, aliquot
 * and throttle state.
 */
typedef struct metaslab_class_allocator {
/*
 * Next metaslab group where this allocator will attempt an allocation.
 * NULL when the class has no active metaslab groups.
 */
metaslab_group_t *mca_rotor;
/*
 * Running total of allocated sizes charged to the current rotor group.
 * Once it reaches the group's mg_aliquot + mg_bias the rotor advances to
 * the next group and this counter is reset to zero.
 */
uint64_t mca_aliquot;
/*
 * The allocation throttle works on a reservation system. Whenever
 * an asynchronous zio wants to perform an allocation it must
 * first reserve the number of blocks that it wants to allocate.
 * If there aren't sufficient slots available for the pending zio
 * then that I/O is throttled until more slots free up. The current
 * number of reserved allocations is maintained by the mca_alloc_slots
 * refcount. The mca_alloc_max_slots value determines the maximum
 * number of allocations that the system allows. Gang blocks are
 * allowed to reserve slots even if we've reached the maximum
 * number of allocations allowed.
 */
uint64_t mca_alloc_max_slots;
zfs_refcount_t mca_alloc_slots;
} metaslab_class_allocator_t;
/* /*
* A metaslab class encompasses a category of allocatable top-level vdevs. * A metaslab class encompasses a category of allocatable top-level vdevs.
* Each top-level vdev is associated with a metaslab group which defines * Each top-level vdev is associated with a metaslab group which defines
@ -145,7 +168,7 @@ typedef enum trace_alloc_type {
* When a block allocation is requested from the SPA it is associated with a * When a block allocation is requested from the SPA it is associated with a
* metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
* to the class can be used to satisfy that request. Allocations are done * to the class can be used to satisfy that request. Allocations are done
* by traversing the metaslab groups that are linked off of the mc_rotor field. * by traversing the metaslab groups that are linked off of the mca_rotor field.
* This rotor points to the next metaslab group where allocations will be * This rotor points to the next metaslab group where allocations will be
* attempted. Allocating a block is a 3 step process -- select the metaslab * attempted. Allocating a block is a 3 step process -- select the metaslab
* group, select the metaslab, and then allocate the block. The metaslab * group, select the metaslab, and then allocate the block. The metaslab
@ -156,9 +179,7 @@ typedef enum trace_alloc_type {
struct metaslab_class { struct metaslab_class {
kmutex_t mc_lock; kmutex_t mc_lock;
spa_t *mc_spa; spa_t *mc_spa;
metaslab_group_t *mc_rotor;
metaslab_ops_t *mc_ops; metaslab_ops_t *mc_ops;
uint64_t mc_aliquot;
/* /*
* Track the number of metaslab groups that have been initialized * Track the number of metaslab groups that have been initialized
@ -173,21 +194,6 @@ struct metaslab_class {
*/ */
boolean_t mc_alloc_throttle_enabled; boolean_t mc_alloc_throttle_enabled;
/*
* The allocation throttle works on a reservation system. Whenever
* an asynchronous zio wants to perform an allocation it must
* first reserve the number of blocks that it wants to allocate.
* If there aren't sufficient slots available for the pending zio
* then that I/O is throttled until more slots free up. The current
* number of reserved allocations is maintained by the mc_alloc_slots
* refcount. The mc_alloc_max_slots value determines the maximum
* number of allocations that the system allows. Gang blocks are
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
*/
uint64_t *mc_alloc_max_slots;
zfs_refcount_t *mc_alloc_slots;
uint64_t mc_alloc_groups; /* # of allocatable groups */ uint64_t mc_alloc_groups; /* # of allocatable groups */
uint64_t mc_alloc; /* total allocated space */ uint64_t mc_alloc; /* total allocated space */
@ -201,6 +207,8 @@ struct metaslab_class {
* recent use. * recent use.
*/ */
multilist_t *mc_metaslab_txg_list; multilist_t *mc_metaslab_txg_list;
metaslab_class_allocator_t mc_allocator[];
}; };
/* /*
@ -258,7 +266,7 @@ struct metaslab_group {
* *
* Each allocator in each metaslab group has a current queue depth * Each allocator in each metaslab group has a current queue depth
* (mg_alloc_queue_depth[allocator]) and a current max queue depth * (mg_alloc_queue_depth[allocator]) and a current max queue depth
* (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group * (mga_cur_max_alloc_queue_depth[allocator]), and each metaslab group
* has an absolute max queue depth (mg_max_alloc_queue_depth). We * has an absolute max queue depth (mg_max_alloc_queue_depth). We
* add IOs to an allocator until the mg_alloc_queue_depth for that * add IOs to an allocator until the mg_alloc_queue_depth for that
* allocator hits the cur_max. Every time an IO completes for a given * allocator hits the cur_max. Every time an IO completes for a given
@ -271,8 +279,7 @@ struct metaslab_group {
* groups are unable to handle their share of allocations. * groups are unable to handle their share of allocations.
*/ */
uint64_t mg_max_alloc_queue_depth; uint64_t mg_max_alloc_queue_depth;
int mg_allocators;
metaslab_group_allocator_t *mg_allocator; /* array */
/* /*
* A metalab group that can no longer allocate the minimum block * A metalab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out * size will set mg_no_free_space. Once a metaslab group is out
@ -290,6 +297,9 @@ struct metaslab_group {
boolean_t mg_disabled_updating; boolean_t mg_disabled_updating;
kmutex_t mg_ms_disabled_lock; kmutex_t mg_ms_disabled_lock;
kcondvar_t mg_ms_disabled_cv; kcondvar_t mg_ms_disabled_cv;
int mg_allocators;
metaslab_group_allocator_t mg_allocator[];
}; };
/* /*

View File

@ -396,20 +396,19 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{ {
metaslab_class_t *mc; metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); mc = kmem_zalloc(offsetof(metaslab_class_t,
mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
mc->mc_spa = spa; mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops; mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t), mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * for (int i = 0; i < spa->spa_alloc_count; i++) {
sizeof (zfs_refcount_t), KM_SLEEP); metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * mca->mca_rotor = NULL;
sizeof (uint64_t), KM_SLEEP); zfs_refcount_create_tracked(&mca->mca_alloc_slots);
for (int i = 0; i < spa->spa_alloc_count; i++) }
zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
return (mc); return (mc);
} }
@ -417,21 +416,22 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
void void
metaslab_class_destroy(metaslab_class_t *mc) metaslab_class_destroy(metaslab_class_t *mc)
{ {
ASSERT(mc->mc_rotor == NULL); spa_t *spa = mc->mc_spa;
ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_alloc == 0);
ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_deferred == 0);
ASSERT(mc->mc_space == 0); ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0); ASSERT(mc->mc_dspace == 0);
for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) for (int i = 0; i < spa->spa_alloc_count; i++) {
zfs_refcount_destroy(&mc->mc_alloc_slots[i]); metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * ASSERT(mca->mca_rotor == NULL);
sizeof (zfs_refcount_t)); zfs_refcount_destroy(&mca->mca_alloc_slots);
kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * }
sizeof (uint64_t));
mutex_destroy(&mc->mc_lock); mutex_destroy(&mc->mc_lock);
multilist_destroy(mc->mc_metaslab_txg_list); multilist_destroy(mc->mc_metaslab_txg_list);
kmem_free(mc, sizeof (metaslab_class_t)); kmem_free(mc, offsetof(metaslab_class_t,
mc_allocator[spa->spa_alloc_count]));
} }
int int
@ -446,7 +446,7 @@ metaslab_class_validate(metaslab_class_t *mc)
ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
if ((mg = mc->mc_rotor) == NULL) if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
return (0); return (0);
do { do {
@ -455,7 +455,7 @@ metaslab_class_validate(metaslab_class_t *mc)
ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(vd->vdev_top, ==, vd);
ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(mg->mg_class, ==, mc);
ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
} while ((mg = mg->mg_next) != mc->mc_rotor); } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
return (0); return (0);
} }
@ -812,7 +812,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{ {
metaslab_group_t *mg; metaslab_group_t *mg;
mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mg = kmem_zalloc(offsetof(metaslab_group_t,
mg_allocator[allocators]), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
@ -825,8 +826,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mg->mg_no_free_space = B_TRUE; mg->mg_no_free_space = B_TRUE;
mg->mg_allocators = allocators; mg->mg_allocators = allocators;
mg->mg_allocator = kmem_zalloc(allocators *
sizeof (metaslab_group_allocator_t), KM_SLEEP);
for (int i = 0; i < allocators; i++) { for (int i = 0; i < allocators; i++) {
metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
@ -860,21 +859,19 @@ metaslab_group_destroy(metaslab_group_t *mg)
metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_destroy(&mga->mga_alloc_queue_depth); zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
} }
kmem_free(mg->mg_allocator, mg->mg_allocators * kmem_free(mg, offsetof(metaslab_group_t,
sizeof (metaslab_group_allocator_t)); mg_allocator[mg->mg_allocators]));
kmem_free(mg, sizeof (metaslab_group_t));
} }
void void
metaslab_group_activate(metaslab_group_t *mg) metaslab_group_activate(metaslab_group_t *mg)
{ {
metaslab_class_t *mc = mg->mg_class; metaslab_class_t *mc = mg->mg_class;
spa_t *spa = mc->mc_spa;
metaslab_group_t *mgprev, *mgnext; metaslab_group_t *mgprev, *mgnext;
ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
ASSERT(mc->mc_rotor != mg);
ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_prev == NULL);
ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_next == NULL);
ASSERT(mg->mg_activation_count <= 0); ASSERT(mg->mg_activation_count <= 0);
@ -885,7 +882,7 @@ metaslab_group_activate(metaslab_group_t *mg)
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
metaslab_group_alloc_update(mg); metaslab_group_alloc_update(mg);
if ((mgprev = mc->mc_rotor) == NULL) { if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
mg->mg_prev = mg; mg->mg_prev = mg;
mg->mg_next = mg; mg->mg_next = mg;
} else { } else {
@ -895,7 +892,10 @@ metaslab_group_activate(metaslab_group_t *mg)
mgprev->mg_next = mg; mgprev->mg_next = mg;
mgnext->mg_prev = mg; mgnext->mg_prev = mg;
} }
mc->mc_rotor = mg; for (int i = 0; i < spa->spa_alloc_count; i++) {
mc->mc_allocator[i].mca_rotor = mg;
mg = mg->mg_next;
}
} }
/* /*
@ -916,7 +916,8 @@ metaslab_group_passivate(metaslab_group_t *mg)
(SCL_ALLOC | SCL_ZIO)); (SCL_ALLOC | SCL_ZIO));
if (--mg->mg_activation_count != 0) { if (--mg->mg_activation_count != 0) {
ASSERT(mc->mc_rotor != mg); for (int i = 0; i < spa->spa_alloc_count; i++)
ASSERT(mc->mc_allocator[i].mca_rotor != mg);
ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_prev == NULL);
ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_next == NULL);
ASSERT(mg->mg_activation_count < 0); ASSERT(mg->mg_activation_count < 0);
@ -963,12 +964,15 @@ metaslab_group_passivate(metaslab_group_t *mg)
mgnext = mg->mg_next; mgnext = mg->mg_next;
if (mg == mgnext) { if (mg == mgnext) {
mc->mc_rotor = NULL; mgnext = NULL;
} else { } else {
mc->mc_rotor = mgnext;
mgprev->mg_next = mgnext; mgprev->mg_next = mgnext;
mgnext->mg_prev = mgprev; mgnext->mg_prev = mgprev;
} }
for (int i = 0; i < spa->spa_alloc_count; i++) {
if (mc->mc_allocator[i].mca_rotor == mg)
mc->mc_allocator[i].mca_rotor = mgnext;
}
mg->mg_prev = NULL; mg->mg_prev = NULL;
mg->mg_next = NULL; mg->mg_next = NULL;
@ -1202,7 +1206,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* in metaslab_group_alloc_update() for more information) and * in metaslab_group_alloc_update() for more information) and
* the allocation throttle is disabled then allow allocations to this * the allocation throttle is disabled then allow allocations to this
* device. However, if the allocation throttle is enabled then * device. However, if the allocation throttle is enabled then
* check if we have reached our allocation limit (mg_alloc_queue_depth) * check if we have reached our allocation limit (mga_alloc_queue_depth)
* to determine if we should allow allocations to this metaslab group. * to determine if we should allow allocations to this metaslab group.
* If all metaslab groups are no longer considered allocatable * If all metaslab groups are no longer considered allocatable
* (mc_alloc_groups == 0) or we're trying to allocate the smallest * (mc_alloc_groups == 0) or we're trying to allocate the smallest
@ -4517,13 +4521,14 @@ static void
metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
{ {
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
metaslab_class_allocator_t *mca =
&mg->mg_class->mc_allocator[allocator];
uint64_t max = mg->mg_max_alloc_queue_depth; uint64_t max = mg->mg_max_alloc_queue_depth;
uint64_t cur = mga->mga_cur_max_alloc_queue_depth; uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
while (cur < max) { while (cur < max) {
if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
cur, cur + 1) == cur) { cur, cur + 1) == cur) {
atomic_inc_64( atomic_inc_64(&mca->mca_alloc_max_slots);
&mg->mg_class->mc_alloc_max_slots[allocator]);
return; return;
} }
cur = mga->mga_cur_max_alloc_queue_depth; cur = mga->mga_cur_max_alloc_queue_depth;
@ -5059,6 +5064,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
zio_alloc_list_t *zal, int allocator) zio_alloc_list_t *zal, int allocator)
{ {
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
metaslab_group_t *mg, *fast_mg, *rotor; metaslab_group_t *mg, *fast_mg, *rotor;
vdev_t *vd; vdev_t *vd;
boolean_t try_hard = B_FALSE; boolean_t try_hard = B_FALSE;
@ -5080,7 +5086,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/* /*
* Start at the rotor and loop through all mgs until we find something. * Start at the rotor and loop through all mgs until we find something.
* Note that there's no locking on mc_rotor or mc_aliquot because * Note that there's no locking on mca_rotor or mca_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't * nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time. * allocate quite as evenly. It all balances out over time.
* *
@ -5116,23 +5122,23 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
mg->mg_next != NULL) mg->mg_next != NULL)
mg = mg->mg_next; mg = mg->mg_next;
} else { } else {
mg = mc->mc_rotor; mg = mca->mca_rotor;
} }
} else if (d != 0) { } else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next; mg = vd->vdev_mg->mg_next;
} else if (flags & METASLAB_FASTWRITE) { } else if (flags & METASLAB_FASTWRITE) {
mg = fast_mg = mc->mc_rotor; mg = fast_mg = mca->mca_rotor;
do { do {
if (fast_mg->mg_vd->vdev_pending_fastwrite < if (fast_mg->mg_vd->vdev_pending_fastwrite <
mg->mg_vd->vdev_pending_fastwrite) mg->mg_vd->vdev_pending_fastwrite)
mg = fast_mg; mg = fast_mg;
} while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
} else { } else {
ASSERT(mc->mc_rotor != NULL); ASSERT(mca->mca_rotor != NULL);
mg = mc->mc_rotor; mg = mca->mca_rotor;
} }
/* /*
@ -5140,7 +5146,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* metaslab group that has been passivated, just follow the rotor. * metaslab group that has been passivated, just follow the rotor.
*/ */
if (mg->mg_class != mc || mg->mg_activation_count <= 0) if (mg->mg_class != mc || mg->mg_activation_count <= 0)
mg = mc->mc_rotor; mg = mca->mca_rotor;
rotor = mg; rotor = mg;
top: top:
@ -5218,7 +5224,7 @@ top:
* Bias is also used to compensate for unequally * Bias is also used to compensate for unequally
* sized vdevs so that space is allocated fairly. * sized vdevs so that space is allocated fairly.
*/ */
if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_t *vs = &vd->vdev_stat;
int64_t vs_free = vs->vs_space - vs->vs_alloc; int64_t vs_free = vs->vs_space - vs->vs_alloc;
int64_t mc_free = mc->mc_space - mc->mc_alloc; int64_t mc_free = mc->mc_space - mc->mc_alloc;
@ -5256,10 +5262,10 @@ top:
} }
if ((flags & METASLAB_FASTWRITE) || if ((flags & METASLAB_FASTWRITE) ||
atomic_add_64_nv(&mc->mc_aliquot, asize) >= atomic_add_64_nv(&mca->mca_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) { mg->mg_aliquot + mg->mg_bias) {
mc->mc_rotor = mg->mg_next; mca->mca_rotor = mg->mg_next;
mc->mc_aliquot = 0; mca->mca_aliquot = 0;
} }
DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_VDEV(&dva[d], vd->vdev_id);
@ -5276,8 +5282,8 @@ top:
return (0); return (0);
} }
next: next:
mc->mc_rotor = mg->mg_next; mca->mca_rotor = mg->mg_next;
mc->mc_aliquot = 0; mca->mca_aliquot = 0;
} while ((mg = mg->mg_next) != rotor); } while ((mg = mg->mg_next) != rotor);
/* /*
@ -5595,15 +5601,15 @@ boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
zio_t *zio, int flags) zio_t *zio, int flags)
{ {
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
uint64_t available_slots = 0; uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE; boolean_t slot_reserved = B_FALSE;
uint64_t max = mc->mc_alloc_max_slots[allocator]; uint64_t max = mca->mca_alloc_max_slots;
ASSERT(mc->mc_alloc_throttle_enabled); ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock); mutex_enter(&mc->mc_lock);
uint64_t reserved_slots = uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
if (reserved_slots < max) if (reserved_slots < max)
available_slots = max - reserved_slots; available_slots = max - reserved_slots;
@ -5613,11 +5619,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
* We reserve the slots individually so that we can unreserve * We reserve the slots individually so that we can unreserve
* them individually when an I/O completes. * them individually when an I/O completes.
*/ */
for (int d = 0; d < slots; d++) { for (int d = 0; d < slots; d++)
reserved_slots = zfs_refcount_add(&mca->mca_alloc_slots, zio);
zfs_refcount_add(&mc->mc_alloc_slots[allocator],
zio);
}
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE; slot_reserved = B_TRUE;
} }
@ -5630,12 +5633,12 @@ void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
int allocator, zio_t *zio) int allocator, zio_t *zio)
{ {
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled); ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock); mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++) { for (int d = 0; d < slots; d++)
(void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], zfs_refcount_remove(&mca->mca_alloc_slots, zio);
zio);
}
mutex_exit(&mc->mc_lock); mutex_exit(&mc->mc_lock);
} }
@ -5789,7 +5792,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
if (mc->mc_rotor == NULL) { /* no vdevs in this class */ if (mc->mc_allocator[allocator].mca_rotor == NULL) {
/* no vdevs in this class */
spa_config_exit(spa, SCL_ALLOC, FTAG); spa_config_exit(spa, SCL_ALLOC, FTAG);
return (SET_ERROR(ENOSPC)); return (SET_ERROR(ENOSPC));
} }

View File

@ -2111,9 +2111,6 @@ spa_passivate_log(spa_t *spa)
ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
if (!spa_has_slogs(spa))
return (B_FALSE);
for (int c = 0; c < rvd->vdev_children; c++) { for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c]; vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg; metaslab_group_t *mg = tvd->vdev_mg;
@ -8883,12 +8880,18 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
} }
for (int i = 0; i < spa->spa_alloc_count; i++) { for (int i = 0; i < spa->spa_alloc_count; i++) {
ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&normal->mc_allocator[i].
ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); mca_alloc_slots));
ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&special->mc_allocator[i].
normal->mc_alloc_max_slots[i] = slots_per_allocator; mca_alloc_slots));
special->mc_alloc_max_slots[i] = slots_per_allocator; ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i].
dedup->mc_alloc_max_slots[i] = slots_per_allocator; mca_alloc_slots));
normal->mc_allocator[i].mca_alloc_max_slots =
slots_per_allocator;
special->mc_allocator[i].mca_alloc_max_slots =
slots_per_allocator;
dedup->mc_allocator[i].mca_alloc_max_slots =
slots_per_allocator;
} }
normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;

View File

@ -2436,7 +2436,7 @@ spa_fini(void)
boolean_t boolean_t
spa_has_slogs(spa_t *spa) spa_has_slogs(spa_t *spa)
{ {
return (spa->spa_log_class->mc_rotor != NULL); return (spa->spa_log_class->mc_groups != 0);
} }
spa_log_state_t spa_log_state_t

View File

@ -2783,8 +2783,8 @@ zio_write_gang_block(zio_t *pio)
ASSERT(has_data); ASSERT(has_data);
flags |= METASLAB_ASYNC_ALLOC; flags |= METASLAB_ASYNC_ALLOC;
VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator], VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
pio)); mca_alloc_slots, pio));
/* /*
* The logical zio has already placed a reservation for * The logical zio has already placed a reservation for
@ -4468,9 +4468,8 @@ zio_done(zio_t *zio)
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
zio->io_allocator); zio->io_allocator);
VERIFY(zfs_refcount_not_held( VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class->
&zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], mc_allocator[zio->io_allocator].mca_alloc_slots, zio));
zio));
} }