mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-13 11:40:25 +03:00
Illumos 4754, 4755
4754 io issued to near-full luns even after setting noalloc threshold
4755 mg_alloc_failures is no longer needed

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/4754
  https://www.illumos.org/issues/4755
  https://github.com/illumos/illumos-gate/commit/b6240e8

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2533
This commit is contained in:
parent
9bd274ddd8
commit
672692c7b7
@@ -24,7 +24,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _SYS_METASLAB_IMPL_H
|
#ifndef _SYS_METASLAB_IMPL_H
|
||||||
@@ -58,7 +58,6 @@ struct metaslab_group {
|
|||||||
kmutex_t mg_lock;
|
kmutex_t mg_lock;
|
||||||
avl_tree_t mg_metaslab_tree;
|
avl_tree_t mg_metaslab_tree;
|
||||||
uint64_t mg_aliquot;
|
uint64_t mg_aliquot;
|
||||||
uint64_t mg_alloc_failures;
|
|
||||||
boolean_t mg_allocatable; /* can we allocate? */
|
boolean_t mg_allocatable; /* can we allocate? */
|
||||||
uint64_t mg_free_capacity; /* percentage free */
|
uint64_t mg_free_capacity; /* percentage free */
|
||||||
int64_t mg_bias;
|
int64_t mg_bias;
|
||||||
|
@@ -40,7 +40,7 @@
|
|||||||
* avoid having to load lots of space_maps in a given txg. There are,
|
* avoid having to load lots of space_maps in a given txg. There are,
|
||||||
* however, some cases where we want to avoid "fast" ganging and instead
|
* however, some cases where we want to avoid "fast" ganging and instead
|
||||||
* we want to do an exhaustive search of all metaslabs on this device.
|
* we want to do an exhaustive search of all metaslabs on this device.
|
||||||
* Currently we don't allow any gang, zil, or dump device related allocations
|
* Currently we don't allow any gang, slog, or dump device related allocations
|
||||||
* to "fast" gang.
|
* to "fast" gang.
|
||||||
*/
|
*/
|
||||||
#define CAN_FASTGANG(flags) \
|
#define CAN_FASTGANG(flags) \
|
||||||
@@ -63,14 +63,6 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
|
|||||||
*/
|
*/
|
||||||
int zfs_condense_pct = 200;
|
int zfs_condense_pct = 200;
|
||||||
|
|
||||||
/*
|
|
||||||
* This value defines the number of allowed allocation failures per vdev.
|
|
||||||
* If a device reaches this threshold in a given txg then we consider skipping
|
|
||||||
* allocations on that device. The value of zfs_mg_alloc_failures is computed
|
|
||||||
* in zio_init() unless it has been overridden in /etc/system.
|
|
||||||
*/
|
|
||||||
int zfs_mg_alloc_failures = 0;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The zfs_mg_noalloc_threshold defines which metaslab groups should
|
* The zfs_mg_noalloc_threshold defines which metaslab groups should
|
||||||
* be eligible for allocation. The value is defined as a percentage of
|
* be eligible for allocation. The value is defined as a percentage of
|
||||||
@@ -1660,10 +1652,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
|||||||
void
|
void
|
||||||
metaslab_sync_reassess(metaslab_group_t *mg)
|
metaslab_sync_reassess(metaslab_group_t *mg)
|
||||||
{
|
{
|
||||||
int64_t failures = mg->mg_alloc_failures;
|
|
||||||
|
|
||||||
metaslab_group_alloc_update(mg);
|
metaslab_group_alloc_update(mg);
|
||||||
atomic_add_64(&mg->mg_alloc_failures, -failures);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Preload the next potential metaslabs
|
* Preload the next potential metaslabs
|
||||||
@@ -1690,7 +1679,7 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
|
|||||||
|
|
||||||
static uint64_t
|
static uint64_t
|
||||||
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
|
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
|
||||||
uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
|
uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
|
||||||
{
|
{
|
||||||
spa_t *spa = mg->mg_vd->vdev_spa;
|
spa_t *spa = mg->mg_vd->vdev_spa;
|
||||||
metaslab_t *msp = NULL;
|
metaslab_t *msp = NULL;
|
||||||
@@ -1717,10 +1706,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
|
|||||||
spa_dbgmsg(spa, "%s: failed to meet weight "
|
spa_dbgmsg(spa, "%s: failed to meet weight "
|
||||||
"requirement: vdev %llu, txg %llu, mg %p, "
|
"requirement: vdev %llu, txg %llu, mg %p, "
|
||||||
"msp %p, psize %llu, asize %llu, "
|
"msp %p, psize %llu, asize %llu, "
|
||||||
"failures %llu, weight %llu",
|
"weight %llu", spa_name(spa),
|
||||||
spa_name(spa), mg->mg_vd->vdev_id, txg,
|
mg->mg_vd->vdev_id, txg,
|
||||||
mg, msp, psize, asize,
|
mg, msp, psize, asize, msp->ms_weight);
|
||||||
mg->mg_alloc_failures, msp->ms_weight);
|
|
||||||
mutex_exit(&mg->mg_lock);
|
mutex_exit(&mg->mg_lock);
|
||||||
return (-1ULL);
|
return (-1ULL);
|
||||||
}
|
}
|
||||||
@@ -1752,27 +1740,6 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
|
|||||||
|
|
||||||
mutex_enter(&msp->ms_lock);
|
mutex_enter(&msp->ms_lock);
|
||||||
|
|
||||||
/*
|
|
||||||
* If we've already reached the allowable number of failed
|
|
||||||
* allocation attempts on this metaslab group then we
|
|
||||||
* consider skipping it. We skip it only if we're allowed
|
|
||||||
* to "fast" gang, the physical size is larger than
|
|
||||||
* a gang block, and we're attempting to allocate from
|
|
||||||
* the primary metaslab.
|
|
||||||
*/
|
|
||||||
if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
|
|
||||||
CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
|
|
||||||
activation_weight == METASLAB_WEIGHT_PRIMARY) {
|
|
||||||
spa_dbgmsg(spa, "%s: skipping metaslab group: "
|
|
||||||
"vdev %llu, txg %llu, mg %p, msp[%llu] %p, "
|
|
||||||
"psize %llu, asize %llu, failures %llu",
|
|
||||||
spa_name(spa), mg->mg_vd->vdev_id, txg, mg,
|
|
||||||
msp->ms_id, msp, psize, asize,
|
|
||||||
mg->mg_alloc_failures);
|
|
||||||
mutex_exit(&msp->ms_lock);
|
|
||||||
return (-1ULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Ensure that the metaslab we have selected is still
|
* Ensure that the metaslab we have selected is still
|
||||||
* capable of handling our request. It's possible that
|
* capable of handling our request. It's possible that
|
||||||
@@ -1812,8 +1779,6 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
|
|||||||
if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
|
if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
atomic_inc_64(&mg->mg_alloc_failures);
|
|
||||||
|
|
||||||
metaslab_passivate(msp, metaslab_block_maxsize(msp));
|
metaslab_passivate(msp, metaslab_block_maxsize(msp));
|
||||||
mutex_exit(&msp->ms_lock);
|
mutex_exit(&msp->ms_lock);
|
||||||
}
|
}
|
||||||
@@ -1980,7 +1945,7 @@ top:
|
|||||||
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
|
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
|
||||||
|
|
||||||
offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
|
offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
|
||||||
dva, d, flags);
|
dva, d);
|
||||||
if (offset != -1ULL) {
|
if (offset != -1ULL) {
|
||||||
/*
|
/*
|
||||||
* If we've just selected this metaslab group,
|
* If we've just selected this metaslab group,
|
||||||
|
@@ -60,8 +60,6 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
|||||||
int zio_bulk_flags = 0;
|
int zio_bulk_flags = 0;
|
||||||
int zio_delay_max = ZIO_DELAY_MAX;
|
int zio_delay_max = ZIO_DELAY_MAX;
|
||||||
|
|
||||||
extern int zfs_mg_alloc_failures;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The following actions directly affect the spa's sync-to-convergence logic.
|
* The following actions directly affect the spa's sync-to-convergence logic.
|
||||||
* The values below define the sync pass when we start performing the action.
|
* The values below define the sync pass when we start performing the action.
|
||||||
@@ -193,13 +191,6 @@ zio_init(void)
|
|||||||
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
|
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
|
|
||||||
* to fail 3 times per txg or 8 failures, whichever is greater.
|
|
||||||
*/
|
|
||||||
if (zfs_mg_alloc_failures == 0)
|
|
||||||
zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
|
|
||||||
|
|
||||||
zio_inject_init();
|
zio_inject_init();
|
||||||
|
|
||||||
lz4_init();
|
lz4_init();
|
||||||
|
Loading…
Reference in New Issue
Block a user