Revert "Reduce latency effects of non-interactive I/O"

Under certain conditions commit a3a4b8def appears to result in a
hang, or poor performance, when importing a pool.  Until the root
cause can be identified it has been reverted from the release branch.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #11245
Brian Behlendorf 2020-11-30 09:38:15 -08:00
parent a4ab0c607e
commit 2c36eb763f
3 changed files with 18 additions and 145 deletions


@@ -148,9 +148,6 @@ struct vdev_queue {
 	avl_tree_t	vq_write_offset_tree;
 	avl_tree_t	vq_trim_offset_tree;
 	uint64_t	vq_last_offset;
-	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
-	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
-	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */


@@ -2011,7 +2011,8 @@ Default value: \fB1\fR.
 .ad
 .RS 12n
 The maximum number of I/Os active to each device. Ideally, this will be >=
-the sum of each queue's max_active. See the section "ZFS I/O SCHEDULER".
+the sum of each queue's max_active. It must be at least the sum of each
+queue's min_active. See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB1,000\fR.
 .RE
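
The added sentence states a hard constraint. As a hypothetical fragment, not part of the patch (vdev_queue_check_limits is invented for illustration; the zfs_vdev_*_min_active symbols are the existing per-queue tunables in vdev_queue.c, and ASSERT3U is the usual ZFS assertion macro), the constraint could be checked like this:

/*
 * Hypothetical sanity check: the device-wide cap must cover the sum of
 * every queue's guaranteed minimum, or the min_active guarantees cannot
 * all be honored at once.
 */
static void
vdev_queue_check_limits(void)
{
	uint32_t min_sum =
	    zfs_vdev_sync_read_min_active + zfs_vdev_sync_write_min_active +
	    zfs_vdev_async_read_min_active + zfs_vdev_async_write_min_active +
	    zfs_vdev_scrub_min_active + zfs_vdev_removal_min_active +
	    zfs_vdev_initializing_min_active + zfs_vdev_trim_min_active +
	    zfs_vdev_rebuild_min_active;
	ASSERT3U(zfs_vdev_max_active, >=, min_sum);
}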
@@ -2160,42 +2161,6 @@ See the section "ZFS I/O SCHEDULER".
 Default value: \fB1\fR.
 .RE
-.sp
-.ne 2
-.na
-\fBzfs_vdev_nia_delay\fR (int)
-.ad
-.RS 12n
-For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
-the number of concurrently-active I/O's is limited to *_min_active, unless
-the vdev is "idle". When there are no interactive I/Os active (sync or
-async), and zfs_vdev_nia_delay I/Os have completed since the last
-interactive I/O, then the vdev is considered to be "idle", and the number
-of concurrently-active non-interactive I/O's is increased to *_max_active.
-See the section "ZFS I/O SCHEDULER".
-.sp
-Default value: \fB5\fR.
-.RE
-.sp
-.ne 2
-.na
-\fBzfs_vdev_nia_credit\fR (int)
-.ad
-.RS 12n
-Some HDDs tend to prioritize sequential I/O so high, that concurrent
-random I/O latency reaches several seconds. On some HDDs it happens
-even if sequential I/Os are submitted one at a time, and so setting
-*_max_active to 1 does not help. To prevent non-interactive I/Os, like
-scrub, from monopolizing the device no more than zfs_vdev_nia_credit
-I/Os can be sent while there are outstanding incomplete interactive
-I/Os. This enforced wait ensures the HDD services the interactive I/O
-within a reasonable amount of time.
-See the section "ZFS I/O SCHEDULER".
-.sp
-Default value: \fB5\fR.
-.RE
 .sp
 .ne 2
 .na
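
Taken together, the two tunables removed above gated each non-interactive queue's effective limits. Below is a condensed, self-contained sketch of that gating for the scrub queue; vq_sketch_t is a simplified stand-in for the vdev_queue_t bookkeeping fields, and the authoritative logic is in the vdev_queue.c hunks further down:

/*
 * Condensed sketch of the reverted nia gating for one queue (scrub),
 * mirroring the removed vdev_queue.c logic.
 */
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static uint32_t zfs_vdev_scrub_min_active = 1;
static uint32_t zfs_vdev_scrub_max_active = 3;	/* pre-revert default */
static uint32_t zfs_vdev_nia_delay = 5;

typedef struct {
	uint32_t vq_ia_active;	/* active interactive I/Os */
	uint32_t vq_nia_credit;	/* non-interactive I/O credit */
} vq_sketch_t;

static uint32_t
scrub_min_active(const vq_sketch_t *vq)
{
	/* With interactive I/O in flight, the minimum is capped by credit. */
	if (vq->vq_ia_active == 0)
		return (zfs_vdev_scrub_min_active);
	return (MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
}

static uint32_t
scrub_max_active(const vq_sketch_t *vq)
{
	if (vq->vq_ia_active > 0) {
		/* Interactive I/O in flight: limited to remaining credit. */
		return (MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
	}
	if (vq->vq_nia_credit < zfs_vdev_nia_delay) {
		/* Fewer than nia_delay completions since the last
		 * interactive I/O: hold at min_active. */
		return (zfs_vdev_scrub_min_active);
	}
	/* Vdev is "idle": open the queue up to its full maximum. */
	return (zfs_vdev_scrub_max_active);
}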


@@ -121,17 +121,16 @@
 /*
  * The maximum number of i/os active to each device. Ideally, this will be >=
- * the sum of each queue's max_active.
+ * the sum of each queue's max_active. It must be at least the sum of each
+ * queue's min_active.
  */
 uint32_t zfs_vdev_max_active = 1000;
 
 /*
  * Per-queue limits on the number of i/os active to each device. If the
  * number of active i/os is < zfs_vdev_max_active, then the min_active comes
- * into play. We will send min_active from each queue round-robin, and then
- * send from queues in the order defined by zio_priority_t up to max_active.
- * Some queues have additional mechanisms to limit number of active I/Os in
- * addition to min_active and max_active, see below.
+ * into play. We will send min_active from each queue, and then select from
+ * queues in the order defined by zio_priority_t.
  *
  * In general, smaller max_active's will lead to lower latency of synchronous
  * operations. Larger max_active's may lead to higher overall throughput,
@@ -152,7 +151,7 @@ uint32_t zfs_vdev_async_read_max_active = 3;
 uint32_t zfs_vdev_async_write_min_active = 2;
 uint32_t zfs_vdev_async_write_max_active = 10;
 uint32_t zfs_vdev_scrub_min_active = 1;
-uint32_t zfs_vdev_scrub_max_active = 3;
+uint32_t zfs_vdev_scrub_max_active = 2;
 uint32_t zfs_vdev_removal_min_active = 1;
 uint32_t zfs_vdev_removal_max_active = 2;
 uint32_t zfs_vdev_initializing_min_active = 1;
@@ -172,28 +171,6 @@ uint32_t zfs_vdev_rebuild_max_active = 3;
 int zfs_vdev_async_write_active_min_dirty_percent = 30;
 int zfs_vdev_async_write_active_max_dirty_percent = 60;
 
-/*
- * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
- * the number of concurrently-active I/O's is limited to *_min_active, unless
- * the vdev is "idle". When there are no interactive I/Os active (sync or
- * async), and zfs_vdev_nia_delay I/Os have completed since the last
- * interactive I/O, then the vdev is considered to be "idle", and the number
- * of concurrently-active non-interactive I/O's is increased to *_max_active.
- */
-uint_t zfs_vdev_nia_delay = 5;
-
-/*
- * Some HDDs tend to prioritize sequential I/O so high that concurrent
- * random I/O latency reaches several seconds. On some HDDs it happens
- * even if sequential I/Os are submitted one at a time, and so setting
- * *_max_active to 1 does not help. To prevent non-interactive I/Os, like
- * scrub, from monopolizing the device no more than zfs_vdev_nia_credit
- * I/Os can be sent while there are outstanding incomplete interactive
- * I/Os. This enforced wait ensures the HDD services the interactive I/O
- * within a reasonable amount of time.
- */
-uint_t zfs_vdev_nia_credit = 5;
-
 /*
  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
  * For read I/Os, we also aggregate across small adjacency gaps; for writes
@@ -284,7 +261,7 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2)
 }
 
 static int
-vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_min_active(zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -296,19 +273,15 @@ vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (zfs_vdev_async_write_min_active);
 	case ZIO_PRIORITY_SCRUB:
-		return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active :
-		    MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
+		return (zfs_vdev_scrub_min_active);
 	case ZIO_PRIORITY_REMOVAL:
-		return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active :
-		    MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active));
+		return (zfs_vdev_removal_min_active);
 	case ZIO_PRIORITY_INITIALIZING:
-		return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active:
-		    MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active));
+		return (zfs_vdev_initializing_min_active);
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_min_active);
 	case ZIO_PRIORITY_REBUILD:
-		return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active :
-		    MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active));
+		return (zfs_vdev_rebuild_min_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -364,7 +337,7 @@ vdev_queue_max_async_writes(spa_t *spa)
 }
 
 static int
-vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -376,34 +349,14 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (vdev_queue_max_async_writes(spa));
 	case ZIO_PRIORITY_SCRUB:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_scrub_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_scrub_min_active);
 		return (zfs_vdev_scrub_max_active);
 	case ZIO_PRIORITY_REMOVAL:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_removal_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_removal_min_active);
 		return (zfs_vdev_removal_max_active);
 	case ZIO_PRIORITY_INITIALIZING:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_initializing_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_initializing_min_active);
 		return (zfs_vdev_initializing_max_active);
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_max_active);
 	case ZIO_PRIORITY_REBUILD:
-		if (vq->vq_ia_active > 0) {
-			return (MIN(vq->vq_nia_credit,
-			    zfs_vdev_rebuild_min_active));
-		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
-			return (zfs_vdev_rebuild_min_active);
 		return (zfs_vdev_rebuild_max_active);
 	default:
 		panic("invalid priority %u", p);
@@ -419,25 +372,18 @@ static zio_priority_t
 vdev_queue_class_to_issue(vdev_queue_t *vq)
 {
 	spa_t *spa = vq->vq_vdev->vdev_spa;
-	zio_priority_t p, n;
+	zio_priority_t p;
 
 	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
-	/*
-	 * Find a queue that has not reached its minimum # outstanding i/os.
-	 * Do round-robin to reduce starvation due to zfs_vdev_max_active
-	 * and vq_nia_credit limits.
-	 */
-	for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
-		p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
+	/* find a queue that has not reached its minimum # outstanding i/os */
+	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_min_active(vq, p)) {
-			vq->vq_last_prio = p;
+		    vdev_queue_class_min_active(p))
 			return (p);
-		}
 	}
 
 	/*
 	 * If we haven't found a queue, look for one that hasn't reached its
@@ -446,11 +392,9 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_max_active(spa, vq, p)) {
-			vq->vq_last_prio = p;
+		    vdev_queue_class_max_active(spa, p))
 			return (p);
-		}
 	}
 
 	/* No eligible queued i/os */
 	return (ZIO_PRIORITY_NUM_QUEUEABLE);
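
The two hunks above are the behavioral half of the revert: the reverted scheduler rotated its scan to reduce starvation (per its removed comment), while the restored one always scans in fixed priority order. A minimal sketch of the difference, where N and has_work_below_min() are hypothetical stand-ins for ZIO_PRIORITY_NUM_QUEUEABLE and the avl_numnodes()/vqc_active checks:

#include <stdbool.h>

#define N 8	/* stand-in for ZIO_PRIORITY_NUM_QUEUEABLE */

/* Hypothetical stand-in for the avl_numnodes()/vqc_active test. */
extern bool has_work_below_min(int p);

/* Reverted behavior: round-robin, resuming after the last issued priority. */
static int
pick_round_robin(int *last_prio)
{
	for (int n = 0; n < N; n++) {
		int p = (*last_prio + n + 1) % N;
		if (has_work_below_min(p)) {
			*last_prio = p;
			return (p);
		}
	}
	return (-1);	/* no eligible queue */
}

/* Restored behavior: strict scan in zio_priority_t order every time. */
static int
pick_strict(void)
{
	for (int p = 0; p < N; p++) {
		if (has_work_below_min(p))
			return (p);
	}
	return (-1);
}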
@@ -549,20 +493,6 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 	}
 }
 
-static boolean_t
-vdev_queue_is_interactive(zio_priority_t p)
-{
-	switch (p) {
-	case ZIO_PRIORITY_SCRUB:
-	case ZIO_PRIORITY_REMOVAL:
-	case ZIO_PRIORITY_INITIALIZING:
-	case ZIO_PRIORITY_REBUILD:
-		return (B_FALSE);
-	default:
-		return (B_TRUE);
-	}
-}
-
 static void
 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
@@ -572,12 +502,6 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active++;
-	if (vdev_queue_is_interactive(zio->io_priority)) {
-		if (++vq->vq_ia_active == 1)
-			vq->vq_nia_credit = 1;
-	} else if (vq->vq_ia_active > 0) {
-		vq->vq_nia_credit--;
-	}
 	avl_add(&vq->vq_active_tree, zio);
 
 	if (shk->kstat != NULL) {
@@ -596,13 +520,6 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active--;
-	if (vdev_queue_is_interactive(zio->io_priority)) {
-		if (--vq->vq_ia_active == 0)
-			vq->vq_nia_credit = 0;
-		else
-			vq->vq_nia_credit = zfs_vdev_nia_credit;
-	} else if (vq->vq_ia_active == 0)
-		vq->vq_nia_credit++;
 	avl_remove(&vq->vq_active_tree, zio);
 
 	if (shk->kstat != NULL) {
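
The credit accounting removed in these two hunks reads as a four-event state machine. A self-contained sketch, mirroring the removed lines rather than describing new behavior (vq_sketch_t is again a simplified stand-in for the vdev_queue_t fields):

#include <stdint.h>

static uint32_t zfs_vdev_nia_credit = 5;

typedef struct {
	uint32_t vq_ia_active;	/* active interactive I/Os */
	uint32_t vq_nia_credit;	/* non-interactive I/O credit */
} vq_sketch_t;

static void
interactive_issued(vq_sketch_t *vq)
{
	/* The first interactive I/O leaves exactly one credit. */
	if (++vq->vq_ia_active == 1)
		vq->vq_nia_credit = 1;
}

static void
interactive_done(vq_sketch_t *vq)
{
	/* Last completion clears credit; otherwise refill to the tunable. */
	if (--vq->vq_ia_active == 0)
		vq->vq_nia_credit = 0;
	else
		vq->vq_nia_credit = zfs_vdev_nia_credit;
}

static void
noninteractive_issued(vq_sketch_t *vq)
{
	/* Spend a credit while interactive I/O is outstanding. */
	if (vq->vq_ia_active > 0)
		vq->vq_nia_credit--;
}

static void
noninteractive_done(vq_sketch_t *vq)
{
	/* Completions on an idle vdev build credit toward nia_delay. */
	if (vq->vq_ia_active == 0)
		vq->vq_nia_credit++;
}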
@@ -1148,12 +1065,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
 	"Min active rebuild I/Os per vdev");
 
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
-	"Number of non-interactive I/Os to allow in sequence");
-
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
-	"Number of non-interactive I/Os before _max_active");
-
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
 	"Queue depth percentage for each top-level vdev");
 /* END CSTYLED */