Mirror of https://git.proxmox.com/git/mirror_zfs.git
Revert "Reduce latency effects of non-interactive I/O"
Under certain conditions commit a3a4b8def appears to result in a hang,
or poor performance, when importing a pool. Until the root cause can be
identified it has been reverted from the release branch.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #11245
parent a4ab0c607e
commit 2c36eb763f

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
@@ -148,9 +148,6 @@ struct vdev_queue {
 	avl_tree_t	vq_write_offset_tree;
 	avl_tree_t	vq_trim_offset_tree;
 	uint64_t	vq_last_offset;
-	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
-	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
-	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */

diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
@@ -2011,7 +2011,8 @@ Default value: \fB1\fR.
 .ad
 .RS 12n
 The maximum number of I/Os active to each device. Ideally, this will be >=
-the sum of each queue's max_active. See the section "ZFS I/O SCHEDULER".
+the sum of each queue's max_active. It must be at least the sum of each
+queue's min_active. See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB1,000\fR.
 .RE
@@ -2160,42 +2161,6 @@ See the section "ZFS I/O SCHEDULER".
 Default value: \fB1\fR.
 .RE
 
-.sp
-.ne 2
-.na
-\fBzfs_vdev_nia_delay\fR (int)
-.ad
-.RS 12n
-For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
-the number of concurrently-active I/O's is limited to *_min_active, unless
-the vdev is "idle". When there are no interactive I/Os active (sync or
-async), and zfs_vdev_nia_delay I/Os have completed since the last
-interactive I/O, then the vdev is considered to be "idle", and the number
-of concurrently-active non-interactive I/O's is increased to *_max_active.
-See the section "ZFS I/O SCHEDULER".
-.sp
-Default value: \fB5\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_vdev_nia_credit\fR (int)
-.ad
-.RS 12n
-Some HDDs tend to prioritize sequential I/O so high, that concurrent
-random I/O latency reaches several seconds. On some HDDs it happens
-even if sequential I/Os are submitted one at a time, and so setting
-*_max_active to 1 does not help. To prevent non-interactive I/Os, like
-scrub, from monopolizing the device no more than zfs_vdev_nia_credit
-I/Os can be sent while there are outstanding incomplete interactive
-I/Os. This enforced wait ensures the HDD services the interactive I/O
-within a reasonable amount of time.
-See the section "ZFS I/O SCHEDULER".
-.sp
-Default value: \fB5\fR.
-.RE
-
 .sp
 .ne 2
 .na

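The two entries removed above describe the mechanism at the heart of the
reverted commit. As a condensed illustration (not code from the tree;
vdev_is_idle() is a hypothetical helper name, while the vq_ia_active and
vq_nia_credit fields and both tunables are the ones this commit removes
from vdev_queue.c below), the gating rule amounted to:

    /*
     * Sketch only: a vdev counted as "idle" for non-interactive I/O
     * when no interactive I/Os were in flight and at least
     * zfs_vdev_nia_delay non-interactive I/Os had completed since the
     * last interactive one (vq_nia_credit tracked that count).
     */
    static boolean_t
    vdev_is_idle(vdev_queue_t *vq)
    {
    	return (vq->vq_ia_active == 0 &&
    	    vq->vq_nia_credit >= zfs_vdev_nia_delay);
    }

While the vdev was idle, the scrub, removal, initialize and rebuild queues
could ramp to their *_max_active limits; otherwise they were held to
*_min_active, and to at most vq_nia_credit I/Os while interactive I/Os
were outstanding.
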
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
@@ -121,17 +121,16 @@
 
 /*
  * The maximum number of i/os active to each device. Ideally, this will be >=
- * the sum of each queue's max_active.
+ * the sum of each queue's max_active. It must be at least the sum of each
+ * queue's min_active.
  */
 uint32_t zfs_vdev_max_active = 1000;
 
 /*
  * Per-queue limits on the number of i/os active to each device. If the
  * number of active i/os is < zfs_vdev_max_active, then the min_active comes
- * into play. We will send min_active from each queue round-robin, and then
- * send from queues in the order defined by zio_priority_t up to max_active.
- * Some queues have additional mechanisms to limit number of active I/Os in
- * addition to min_active and max_active, see below.
+ * into play. We will send min_active from each queue, and then select from
+ * queues in the order defined by zio_priority_t.
  *
  * In general, smaller max_active's will lead to lower latency of synchronous
  * operations. Larger max_active's may lead to higher overall throughput,
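
The restored comment describes a two-pass selection. A minimal sketch of
that shape (illustrative only; the real implementation is
vdev_queue_class_to_issue(), whose hunks appear further down in this diff):

    /* Sketch: serve any queue still below min_active first, then fall */
    /* back to strict zio_priority_t order bounded by max_active. */
    static zio_priority_t
    pick_queue_sketch(spa_t *spa, vdev_queue_t *vq)
    {
    	zio_priority_t p;

    	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
    		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
    		    vq->vq_class[p].vqc_active <
    		    vdev_queue_class_min_active(p))
    			return (p);
    	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
    		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
    		    vq->vq_class[p].vqc_active <
    		    vdev_queue_class_max_active(spa, p))
    			return (p);
    	return (ZIO_PRIORITY_NUM_QUEUEABLE);	/* nothing eligible */
    }
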
@@ -152,7 +151,7 @@ uint32_t zfs_vdev_async_read_max_active = 3;
 uint32_t zfs_vdev_async_write_min_active = 2;
 uint32_t zfs_vdev_async_write_max_active = 10;
 uint32_t zfs_vdev_scrub_min_active = 1;
-uint32_t zfs_vdev_scrub_max_active = 3;
+uint32_t zfs_vdev_scrub_max_active = 2;
 uint32_t zfs_vdev_removal_min_active = 1;
 uint32_t zfs_vdev_removal_max_active = 2;
 uint32_t zfs_vdev_initializing_min_active = 1;
@@ -172,28 +171,6 @@ uint32_t zfs_vdev_rebuild_max_active = 3;
 int zfs_vdev_async_write_active_min_dirty_percent = 30;
 int zfs_vdev_async_write_active_max_dirty_percent = 60;
 
-/*
- * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
- * the number of concurrently-active I/O's is limited to *_min_active, unless
- * the vdev is "idle". When there are no interactive I/Os active (sync or
- * async), and zfs_vdev_nia_delay I/Os have completed since the last
- * interactive I/O, then the vdev is considered to be "idle", and the number
- * of concurrently-active non-interactive I/O's is increased to *_max_active.
- */
-uint_t zfs_vdev_nia_delay = 5;
-
-/*
- * Some HDDs tend to prioritize sequential I/O so high that concurrent
- * random I/O latency reaches several seconds. On some HDDs it happens
- * even if sequential I/Os are submitted one at a time, and so setting
- * *_max_active to 1 does not help. To prevent non-interactive I/Os, like
- * scrub, from monopolizing the device no more than zfs_vdev_nia_credit
- * I/Os can be sent while there are outstanding incomplete interactive
- * I/Os. This enforced wait ensures the HDD services the interactive I/O
- * within a reasonable amount of time.
- */
-uint_t zfs_vdev_nia_credit = 5;
-
 /*
  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
  * For read I/Os, we also aggregate across small adjacency gaps; for writes
@@ -284,7 +261,7 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2)
 }
 
 static int
-vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_min_active(zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -296,19 +273,15 @@ vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (zfs_vdev_async_write_min_active);
 	case ZIO_PRIORITY_SCRUB:
-		return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active :
-		    MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
+		return (zfs_vdev_scrub_min_active);
 	case ZIO_PRIORITY_REMOVAL:
-		return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active :
-		    MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active));
+		return (zfs_vdev_removal_min_active);
 	case ZIO_PRIORITY_INITIALIZING:
-		return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active:
-		    MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active));
+		return (zfs_vdev_initializing_min_active);
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_min_active);
 	case ZIO_PRIORITY_REBUILD:
-		return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active :
-		    MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active));
+		return (zfs_vdev_rebuild_min_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -364,7 +337,7 @@ vdev_queue_max_async_writes(spa_t *spa)
 }
 
 static int
-vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -376,34 +349,14 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (vdev_queue_max_async_writes(spa));
 	case ZIO_PRIORITY_SCRUB:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_scrub_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_scrub_min_active);
 		return (zfs_vdev_scrub_max_active);
 	case ZIO_PRIORITY_REMOVAL:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_removal_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_removal_min_active);
 		return (zfs_vdev_removal_max_active);
 	case ZIO_PRIORITY_INITIALIZING:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_initializing_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_initializing_min_active);
 		return (zfs_vdev_initializing_max_active);
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_max_active);
 	case ZIO_PRIORITY_REBUILD:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_rebuild_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_rebuild_min_active);
 		return (zfs_vdev_rebuild_max_active);
 	default:
 		panic("invalid priority %u", p);
@@ -419,25 +372,18 @@ static zio_priority_t
 vdev_queue_class_to_issue(vdev_queue_t *vq)
 {
 	spa_t *spa = vq->vq_vdev->vdev_spa;
-	zio_priority_t p, n;
+	zio_priority_t p;
 
 	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
-	/*
-	 * Find a queue that has not reached its minimum # outstanding i/os.
-	 * Do round-robin to reduce starvation due to zfs_vdev_max_active
-	 * and vq_nia_credit limits.
-	 */
-	for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
-		p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
+	/* find a queue that has not reached its minimum # outstanding i/os */
+	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_min_active(vq, p)) {
-			vq->vq_last_prio = p;
+		    vdev_queue_class_min_active(p))
 			return (p);
-		}
 	}
 
 	/*
 	 * If we haven't found a queue, look for one that hasn't reached its
@@ -446,11 +392,9 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_max_active(spa, vq, p)) {
-			vq->vq_last_prio = p;
+		    vdev_queue_class_max_active(spa, p))
 			return (p);
-		}
 	}
 
 	/* No eligible queued i/os */
 	return (ZIO_PRIORITY_NUM_QUEUEABLE);
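
For contrast, the scan order of the round-robin pass removed in these two
hunks is easy to demonstrate standalone (plain C, not ZFS code; the constants
are stand-ins for ZIO_PRIORITY_NUM_QUEUEABLE and vq->vq_last_prio):

    #include <stdio.h>

    int
    main(void)
    {
    	int nqueueable = 6;	/* stand-in for ZIO_PRIORITY_NUM_QUEUEABLE */
    	int last_prio = 2;	/* stand-in for vq->vq_last_prio */

    	/* Start just past the last-issued priority and wrap once. */
    	for (int n = 0; n < nqueueable; n++)
    		printf("%d ", (last_prio + n + 1) % nqueueable);
    	printf("\n");	/* prints: 3 4 5 0 1 2 */
    	return (0);
    }

The reverted code used this rotation so that a priority throttled by
zfs_vdev_max_active or vq_nia_credit would not permanently starve the ones
behind it; the restored code always scans from priority 0 upward.
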
@@ -549,20 +493,6 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 	}
 }
 
-static boolean_t
-vdev_queue_is_interactive(zio_priority_t p)
-{
-	switch (p) {
-	case ZIO_PRIORITY_SCRUB:
-	case ZIO_PRIORITY_REMOVAL:
-	case ZIO_PRIORITY_INITIALIZING:
-	case ZIO_PRIORITY_REBUILD:
-		return (B_FALSE);
-	default:
-		return (B_TRUE);
-	}
-}
-
 static void
 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
@@ -572,12 +502,6 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active++;
-	if (vdev_queue_is_interactive(zio->io_priority)) {
-		if (++vq->vq_ia_active == 1)
-			vq->vq_nia_credit = 1;
-	} else if (vq->vq_ia_active > 0) {
-		vq->vq_nia_credit--;
-	}
 	avl_add(&vq->vq_active_tree, zio);
 
 	if (shk->kstat != NULL) {
@@ -596,13 +520,6 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active--;
-	if (vdev_queue_is_interactive(zio->io_priority)) {
-		if (--vq->vq_ia_active == 0)
-			vq->vq_nia_credit = 0;
-		else
-			vq->vq_nia_credit = zfs_vdev_nia_credit;
-	} else if (vq->vq_ia_active == 0)
-		vq->vq_nia_credit++;
 	avl_remove(&vq->vq_active_tree, zio);
 
 	if (shk->kstat != NULL) {
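
Taken together, these two hunks removed the bookkeeping that maintained
vq_ia_active and vq_nia_credit for the scheduler. Condensed from the removed
lines above (illustration only):

    /*
     * on issue (vdev_queue_pending_add):
     *   interactive zio:     ++vq_ia_active; on the 0 -> 1 transition,
     *                        vq_nia_credit = 1;
     *   non-interactive zio: if vq_ia_active > 0, --vq_nia_credit;
     *
     * on completion (vdev_queue_pending_remove):
     *   interactive zio:     --vq_ia_active; on the 1 -> 0 transition,
     *                        vq_nia_credit = 0, otherwise
     *                        vq_nia_credit = zfs_vdev_nia_credit;
     *   non-interactive zio: if vq_ia_active == 0, ++vq_nia_credit;
     */
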
@@ -1148,12 +1065,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
 	"Min active rebuild I/Os per vdev");
 
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
-	"Number of non-interactive I/Os to allow in sequence");
-
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
-	"Number of non-interactive I/Os before _max_active");
-
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
 	"Queue depth percentage for each top-level vdev");
 /* END CSTYLED */
|