Revert "Reduce latency effects of non-interactive I/O"

Under certain conditions commit a3a4b8def appears to result in a
hang, or poor performance, when importing a pool.  Until the root
cause can be identified it has been reverted from the release branch.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #11245
Brian Behlendorf, 2020-11-30 09:38:15 -08:00
parent a4ab0c607e, commit 2c36eb763f
3 changed files with 18 additions and 145 deletions
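For reference, a minimal standalone sketch (plain C, not taken from the ZFS sources) of the throttle this revert removes. The vq_ia_active / vq_nia_credit fields and the zfs_vdev_nia_delay, zfs_vdev_nia_credit and zfs_vdev_scrub_* tunables mirror the names deleted in the diff below; struct vq_sketch, scrub_limit() and the values set in main() are illustrative only.

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

/* Tunables as they appear in the removed code below. */
static uint32_t zfs_vdev_nia_delay = 5;
static uint32_t zfs_vdev_nia_credit = 5;
static uint32_t zfs_vdev_scrub_min_active = 1;
static uint32_t zfs_vdev_scrub_max_active = 3;

struct vq_sketch {
	uint32_t ia_active;	/* active interactive I/Os (sync/async) */
	uint32_t nia_credit;	/* credit left for non-interactive I/Os */
};

/*
 * Cap on concurrently-active scrub I/Os under the reverted scheme:
 * while interactive I/O is outstanding, scrub is limited by the credit;
 * only after the vdev has been "idle" long enough may it use max_active.
 */
static uint32_t
scrub_limit(const struct vq_sketch *vq)
{
	if (vq->ia_active > 0)
		return (MIN(vq->nia_credit, zfs_vdev_scrub_min_active));
	if (vq->nia_credit < zfs_vdev_nia_delay)
		return (zfs_vdev_scrub_min_active);
	return (zfs_vdev_scrub_max_active);
}

int
main(void)
{
	struct vq_sketch vq = { .ia_active = 1, .nia_credit = 1 };

	/* Interactive I/O in flight: scrub capped by its credit. */
	printf("busy:  scrub limit = %u\n", (unsigned)scrub_limit(&vq));

	/* Last interactive I/O completes: credit resets, vdev not yet "idle". */
	vq.ia_active = 0;
	vq.nia_credit = 0;
	printf("quiet: scrub limit = %u\n", (unsigned)scrub_limit(&vq));

	/* zfs_vdev_nia_delay I/Os complete with no interactive I/O: "idle". */
	vq.nia_credit = zfs_vdev_nia_delay;
	printf("idle:  scrub limit = %u\n", (unsigned)scrub_limit(&vq));
	return (0);
}

The credit only grows toward zfs_vdev_nia_delay while no interactive I/O is outstanding, which is what the deleted vdev_queue_pending_add()/vdev_queue_pending_remove() hunks below maintained.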

View File

@@ -148,9 +148,6 @@ struct vdev_queue {
 	avl_tree_t	vq_write_offset_tree;
 	avl_tree_t	vq_trim_offset_tree;
 	uint64_t	vq_last_offset;
-	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
-	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
-	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	hrtime_t	vq_io_complete_ts;	/* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search;	/* used as local for stack reduction */

View File

@@ -2011,7 +2011,8 @@ Default value: \fB1\fR.
 .ad
 .RS 12n
 The maximum number of I/Os active to each device. Ideally, this will be >=
-the sum of each queue's max_active. See the section "ZFS I/O SCHEDULER".
+the sum of each queue's max_active. It must be at least the sum of each
+queue's min_active. See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB1,000\fR.
 .RE
@@ -2160,42 +2161,6 @@ See the section "ZFS I/O SCHEDULER".
 Default value: \fB1\fR.
 .RE
-.sp
-.ne 2
-.na
-\fBzfs_vdev_nia_delay\fR (int)
-.ad
-.RS 12n
-For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
-the number of concurrently-active I/O's is limited to *_min_active, unless
-the vdev is "idle". When there are no interactive I/Os active (sync or
-async), and zfs_vdev_nia_delay I/Os have completed since the last
-interactive I/O, then the vdev is considered to be "idle", and the number
-of concurrently-active non-interactive I/O's is increased to *_max_active.
-See the section "ZFS I/O SCHEDULER".
-.sp
-Default value: \fB5\fR.
-.RE
-.sp
-.ne 2
-.na
-\fBzfs_vdev_nia_credit\fR (int)
-.ad
-.RS 12n
-Some HDDs tend to prioritize sequential I/O so high, that concurrent
-random I/O latency reaches several seconds. On some HDDs it happens
-even if sequential I/Os are submitted one at a time, and so setting
-*_max_active to 1 does not help. To prevent non-interactive I/Os, like
-scrub, from monopolizing the device no more than zfs_vdev_nia_credit
-I/Os can be sent while there are outstanding incomplete interactive
-I/Os. This enforced wait ensures the HDD services the interactive I/O
-within a reasonable amount of time.
-See the section "ZFS I/O SCHEDULER".
-.sp
-Default value: \fB5\fR.
-.RE
 .sp
 .ne 2
 .na
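The sentence restored above, that zfs_vdev_max_active must be at least the sum of each queue's min_active, can be sanity-checked with a throwaway sketch. The values marked as placeholders are for queues whose defaults are not visible in this diff; the rest are the defaults shown in the vdev_queue changes below.

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	unsigned int max_active = 1000;	/* zfs_vdev_max_active default */
	unsigned int min_active[] = {
		10,	/* sync read    - placeholder, not in this diff */
		10,	/* sync write   - placeholder, not in this diff */
		1,	/* async read   - placeholder, not in this diff */
		2,	/* async write  - zfs_vdev_async_write_min_active */
		1,	/* scrub        - zfs_vdev_scrub_min_active */
		1,	/* removal      - zfs_vdev_removal_min_active */
		1,	/* initializing - zfs_vdev_initializing_min_active */
		1,	/* trim         - placeholder, not in this diff */
		1,	/* rebuild      - placeholder, not in this diff */
	};
	unsigned int sum = 0;

	for (unsigned int i = 0; i < sizeof (min_active) / sizeof (min_active[0]); i++)
		sum += min_active[i];

	printf("sum of min_active = %u, zfs_vdev_max_active = %u\n", sum, max_active);
	assert(max_active >= sum);	/* the requirement the man page states */
	return (0);
}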

View File

@@ -121,17 +121,16 @@
 /*
  * The maximum number of i/os active to each device. Ideally, this will be >=
- * the sum of each queue's max_active.
+ * the sum of each queue's max_active. It must be at least the sum of each
+ * queue's min_active.
  */
 uint32_t zfs_vdev_max_active = 1000;
 
 /*
  * Per-queue limits on the number of i/os active to each device. If the
  * number of active i/os is < zfs_vdev_max_active, then the min_active comes
- * into play. We will send min_active from each queue round-robin, and then
- * send from queues in the order defined by zio_priority_t up to max_active.
- * Some queues have additional mechanisms to limit number of active I/Os in
- * addition to min_active and max_active, see below.
+ * into play. We will send min_active from each queue, and then select from
+ * queues in the order defined by zio_priority_t.
  *
  * In general, smaller max_active's will lead to lower latency of synchronous
  * operations. Larger max_active's may lead to higher overall throughput,
@@ -152,7 +151,7 @@ uint32_t zfs_vdev_async_read_max_active = 3;
 uint32_t zfs_vdev_async_write_min_active = 2;
 uint32_t zfs_vdev_async_write_max_active = 10;
 uint32_t zfs_vdev_scrub_min_active = 1;
-uint32_t zfs_vdev_scrub_max_active = 3;
+uint32_t zfs_vdev_scrub_max_active = 2;
 uint32_t zfs_vdev_removal_min_active = 1;
 uint32_t zfs_vdev_removal_max_active = 2;
 uint32_t zfs_vdev_initializing_min_active = 1;
@@ -172,28 +171,6 @@ uint32_t zfs_vdev_rebuild_max_active = 3;
 int zfs_vdev_async_write_active_min_dirty_percent = 30;
 int zfs_vdev_async_write_active_max_dirty_percent = 60;
 
-/*
- * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
- * the number of concurrently-active I/O's is limited to *_min_active, unless
- * the vdev is "idle". When there are no interactive I/Os active (sync or
- * async), and zfs_vdev_nia_delay I/Os have completed since the last
- * interactive I/O, then the vdev is considered to be "idle", and the number
- * of concurrently-active non-interactive I/O's is increased to *_max_active.
- */
-uint_t zfs_vdev_nia_delay = 5;
-
-/*
- * Some HDDs tend to prioritize sequential I/O so high that concurrent
- * random I/O latency reaches several seconds. On some HDDs it happens
- * even if sequential I/Os are submitted one at a time, and so setting
- * *_max_active to 1 does not help. To prevent non-interactive I/Os, like
- * scrub, from monopolizing the device no more than zfs_vdev_nia_credit
- * I/Os can be sent while there are outstanding incomplete interactive
- * I/Os. This enforced wait ensures the HDD services the interactive I/O
- * within a reasonable amount of time.
- */
-uint_t zfs_vdev_nia_credit = 5;
-
 /*
  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
  * For read I/Os, we also aggregate across small adjacency gaps; for writes
@@ -284,7 +261,7 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2)
 }
 
 static int
-vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_min_active(zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -296,19 +273,15 @@ vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (zfs_vdev_async_write_min_active);
 	case ZIO_PRIORITY_SCRUB:
-		return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active :
-		    MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
+		return (zfs_vdev_scrub_min_active);
 	case ZIO_PRIORITY_REMOVAL:
-		return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active :
-		    MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active));
+		return (zfs_vdev_removal_min_active);
 	case ZIO_PRIORITY_INITIALIZING:
-		return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active:
-		    MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active));
+		return (zfs_vdev_initializing_min_active);
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_min_active);
 	case ZIO_PRIORITY_REBUILD:
-		return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active :
-		    MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active));
+		return (zfs_vdev_rebuild_min_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -364,7 +337,7 @@ vdev_queue_max_async_writes(spa_t *spa)
 }
 
 static int
-vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -376,34 +349,14 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (vdev_queue_max_async_writes(spa));
 	case ZIO_PRIORITY_SCRUB:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_scrub_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_scrub_min_active);
 		return (zfs_vdev_scrub_max_active);
 	case ZIO_PRIORITY_REMOVAL:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_removal_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_removal_min_active);
 		return (zfs_vdev_removal_max_active);
 	case ZIO_PRIORITY_INITIALIZING:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_initializing_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_initializing_min_active);
 		return (zfs_vdev_initializing_max_active);
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_max_active);
 	case ZIO_PRIORITY_REBUILD:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_rebuild_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_rebuild_min_active);
 		return (zfs_vdev_rebuild_max_active);
 	default:
 		panic("invalid priority %u", p);
@@ -419,24 +372,17 @@ static zio_priority_t
 vdev_queue_class_to_issue(vdev_queue_t *vq)
 {
 	spa_t *spa = vq->vq_vdev->vdev_spa;
-	zio_priority_t p, n;
+	zio_priority_t p;
 
 	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
-	/*
-	 * Find a queue that has not reached its minimum # outstanding i/os.
-	 * Do round-robin to reduce starvation due to zfs_vdev_max_active
-	 * and vq_nia_credit limits.
-	 */
-	for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
-		p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
+	/* find a queue that has not reached its minimum # outstanding i/os */
+	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_min_active(vq, p)) {
-			vq->vq_last_prio = p;
+		    vdev_queue_class_min_active(p))
 			return (p);
-		}
 	}
 
 	/*
@@ -446,10 +392,8 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_max_active(spa, vq, p)) {
-			vq->vq_last_prio = p;
+		    vdev_queue_class_max_active(spa, p))
 			return (p);
-		}
 	}
 
 	/* No eligible queued i/os */
@@ -549,20 +493,6 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 	}
 }
 
-static boolean_t
-vdev_queue_is_interactive(zio_priority_t p)
-{
-	switch (p) {
-	case ZIO_PRIORITY_SCRUB:
-	case ZIO_PRIORITY_REMOVAL:
-	case ZIO_PRIORITY_INITIALIZING:
-	case ZIO_PRIORITY_REBUILD:
-		return (B_FALSE);
-	default:
-		return (B_TRUE);
-	}
-}
-
 static void
 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
@@ -572,12 +502,6 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active++;
-	if (vdev_queue_is_interactive(zio->io_priority)) {
-		if (++vq->vq_ia_active == 1)
-			vq->vq_nia_credit = 1;
-	} else if (vq->vq_ia_active > 0) {
-		vq->vq_nia_credit--;
-	}
 	avl_add(&vq->vq_active_tree, zio);
 
 	if (shk->kstat != NULL) {
@@ -596,13 +520,6 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active--;
-	if (vdev_queue_is_interactive(zio->io_priority)) {
-		if (--vq->vq_ia_active == 0)
-			vq->vq_nia_credit = 0;
-		else
-			vq->vq_nia_credit = zfs_vdev_nia_credit;
-	} else if (vq->vq_ia_active == 0)
-		vq->vq_nia_credit++;
 	avl_remove(&vq->vq_active_tree, zio);
 
 	if (shk->kstat != NULL) {
@@ -1148,12 +1065,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
 	"Min active rebuild I/Os per vdev");
 
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
-	"Number of non-interactive I/Os to allow in sequence");
-
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
-	"Number of non-interactive I/Os before _max_active");
-
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
 	"Queue depth percentage for each top-level vdev");
 /* END CSTYLED */
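For context, the post-revert selection logic in vdev_queue_class_to_issue() reduces to a simple two-pass scan, sketched below in plain C. The arrays, NCLASSES, and the limit values are illustrative stand-ins for the per-class AVL trees and tunables; only the two-pass order comes from the restored code above.

#include <stdio.h>

#define NCLASSES	5	/* stand-in for ZIO_PRIORITY_NUM_QUEUEABLE */

static unsigned int pending[NCLASSES];	/* queued I/Os per class */
static unsigned int active[NCLASSES];	/* active I/Os per class  */
static unsigned int min_active[NCLASSES] = { 10, 10, 1, 2, 1 };
static unsigned int max_active[NCLASSES] = { 10, 10, 3, 10, 2 };

/* Returns the class to issue from, or -1 if nothing is eligible. */
static int
class_to_issue(void)
{
	/* find a queue that has not reached its minimum # outstanding i/os */
	for (int p = 0; p < NCLASSES; p++)
		if (pending[p] > 0 && active[p] < min_active[p])
			return (p);

	/* otherwise, fill queues up to max_active in priority order */
	for (int p = 0; p < NCLASSES; p++)
		if (pending[p] > 0 && active[p] < max_active[p])
			return (p);

	return (-1);	/* no eligible queued i/os */
}

int
main(void)
{
	pending[2] = 4;		/* some I/O queued in class 2 */
	pending[4] = 8;		/* a scrub-like class queued */
	active[2] = 1;		/* class 2 already at its min */

	printf("next class = %d\n", class_to_issue());	/* 4: still below its min */
	active[4] = 1;
	printf("next class = %d\n", class_to_issue());	/* 2: max_active pass */
	return (0);
}

Without the removed round-robin start point (vq_last_prio), both passes always scan classes in zio_priority_t order, which is the behavior this revert restores.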