mirror of https://git.proxmox.com/git/mirror_zfs.git
Bypass metaslab throttle for removal allocations
Context:
We recently had a scenario where a customer with 2x10TB disks at 95+%
fragmentation and capacity wanted to migrate their disks to a 2x20TB
setup. So they added the 2 new disks and submitted the removal of the
first 10TB disk. The removal took far longer than expected (more than
a week to 2 weeks vs a couple of days) and once it was done it had
generated a huge indirect mapping table in RAM (~16GB vs the expected
~1GB).

Root-Cause:
The removal code calls `metaslab_alloc_dva()` to allocate a new block
for each evacuating block in the removing device, and it tries to
batch them into 16MB segments. If it can't find such a segment it
tries for 8MB, 4MB, and so on, all the way down to 512 bytes. In our
scenario, `metaslab_alloc_dva()` in the removal thread would pick the
new devices first but wouldn't allocate from them because of the
throttling on their metaslab allocation queue depth (see
`metaslab_group_allocatable()`), as these devices are new and favored
for most types of allocations because of their free space. The removal
thread would then look at the old fragmented disk, fail to find any
contiguous space, and retry with ever smaller allocation sizes until
it got down to the low-KB range (see the first sketch below). This
generated a lot of small mappings, blowing up the size of the indirect
table. It also wasted a lot of CPU while the removal was active,
making everything slow.

This patch:
Make all allocations coming from the device removal thread bypass the
throttle checks. These allocations are not even counted in the
metaslab allocation queues anyway, so why check them?

Side-Fix:
Allocations with METASLAB_DONT_THROTTLE in their flags would not be
counted in the throttle queues, yet they would still abide by the
throttling rules, which seems wrong. This patch fixes this by checking
for that flag in `metaslab_group_allocatable()`. I did a quick check
to see where else this flag is used and it doesn't seem like this
change would cause issues.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #14159
parent 5f73bbba43
commit 7bf4c97a36
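
To make the Root-Cause concrete, here is a minimal, self-contained
sketch of the halving retry behavior. This is not the actual OpenZFS
code: try_alloc() and its 8KB contiguity limit are invented stand-ins
for the path that ends in metaslab_alloc_dva().

/* Illustration only -- not OpenZFS code. */
#include <stdint.h>
#include <stdio.h>

#define	SPA_MINBLOCKSIZE	512ULL
#define	MAX_SEGMENT		(16ULL * 1024 * 1024)	/* 16MB batch target */

/*
 * Hypothetical stand-in for the allocation attempt that ends up in
 * metaslab_alloc_dva(): pretend the fragmented pool only has
 * contiguous runs of up to 8KB left.
 */
static int
try_alloc(uint64_t size)
{
	return (size <= 8192 ? 0 : -1);
}

int
main(void)
{
	uint64_t size = MAX_SEGMENT;
	int retries = 0;

	/*
	 * Halve the requested size until the allocation succeeds; on a
	 * 95%+ fragmented pool this walks far down from 16MB, and each
	 * tiny success becomes its own indirect-mapping entry.
	 */
	while (try_alloc(size) != 0 && size > SPA_MINBLOCKSIZE) {
		size /= 2;
		retries++;
	}
	printf("allocated %llu bytes after %d retries\n",
	    (unsigned long long)size, retries);
	return (0);
}

Every one of those small successful allocations costs a mapping entry,
which is how the indirect table grew to ~16GB.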
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
@@ -1223,7 +1223,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
  */
 static boolean_t
 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
-    uint64_t psize, int allocator, int d)
+    int flags, uint64_t psize, int allocator, int d)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_class_t *mc = mg->mg_class;
@@ -1267,6 +1267,15 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
 	if (mg->mg_no_free_space)
 		return (B_FALSE);
 
+	/*
+	 * Some allocations (e.g., those coming from device removal
+	 * where the allocations are not even counted in the
+	 * metaslab allocation queues) are allowed to bypass
+	 * the throttle.
+	 */
+	if (flags & METASLAB_DONT_THROTTLE)
+		return (B_TRUE);
+
 	/*
 	 * Relax allocation throttling for ditto blocks. Due to
 	 * random imbalances in allocation it tends to push copies
@@ -5188,7 +5197,7 @@ top:
 	 */
 	if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
 		allocatable = metaslab_group_allocatable(mg, rotor,
-		    psize, allocator, d);
+		    flags, psize, allocator, d);
 	}
 
 	if (!allocatable) {
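
As a reading aid for the Side-Fix before the vdev_removal.c half of the
change, here is a simplified model of the inconsistency the new early
return resolves. The field names qdepth/qmax and the flag value are
invented; the real code uses per-allocator queue-depth refcounts.

/* Simplified model of the Side-Fix -- illustration only. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	METASLAB_DONT_THROTTLE	0x8	/* illustrative value only */

struct mgroup {
	uint64_t qdepth;	/* in-flight (counted) allocations */
	uint64_t qmax;		/* throttle cap */
};

/*
 * Before the patch, DONT_THROTTLE allocations were never added to
 * qdepth, yet this check could still reject them -- turned away by a
 * queue they never occupied. The early return is the fix.
 */
static bool
group_allocatable(const struct mgroup *g, int flags)
{
	if (flags & METASLAB_DONT_THROTTLE)
		return (true);
	return (g->qdepth < g->qmax);
}

int
main(void)
{
	struct mgroup g = { .qdepth = 64, .qmax = 64 };	/* cap reached */

	/* Throttled caller is rejected; removal caller now gets through. */
	printf("normal: %d, removal: %d\n",
	    group_allocatable(&g, 0),
	    group_allocatable(&g, METASLAB_DONT_THROTTLE));
	return (0);
}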
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
@@ -1168,11 +1168,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
 	metaslab_class_t *mc = mg->mg_class;
 	if (mc->mc_groups == 0)
 		mc = spa_normal_class(spa);
-	int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
-	    zal, 0);
+	int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg,
+	    METASLAB_DONT_THROTTLE, zal, 0);
 	if (error == ENOSPC && mc != spa_normal_class(spa)) {
 		error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
-		    &dst, 0, NULL, txg, 0, zal, 0);
+		    &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0);
 	}
 	if (error != 0)
 		return (error);
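
Finally, a back-of-the-envelope check of the ~1GB vs ~16GB figures from
the commit message. The 24-byte entry size (one 64-bit source word plus
a two-word DVA per vdev_indirect_mapping_entry_phys_t) and the ~9.5TB
of evacuated data are assumptions for illustration, not values taken
from the commit.

/* Rough arithmetic behind the indirect-table sizes -- illustration only. */
#include <stdio.h>

int
main(void)
{
	const double copied = 9.5e12;	/* ~95% of a 10TB disk, in bytes */
	const double entry = 24.0;	/* assumed bytes per mapping entry */
	const double tables[] = { 1e9, 16e9 };	/* expected vs observed */

	for (int i = 0; i < 2; i++) {
		double entries = tables[i] / entry;
		/* average bytes mapped per entry implied by table size */
		printf("%4.0fGB table => %4.0fM entries => ~%.0fKB avg mapping\n",
		    tables[i] / 1e9, entries / 1e6, copied / entries / 1e3);
	}
	return (0);
}

Under these assumptions a ~1GB table implies mappings averaging a few
hundred KB, while a ~16GB table implies mappings in the mid-teens of
KB: a 16x smaller average mapping means a 16x larger table, matching
the blow-up the customer observed.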