From ec8501ee1274205f277a7287c3de8119d361afaf Mon Sep 17 00:00:00 2001 From: "Justin T. Gibbs" Date: Sat, 11 Apr 2015 14:51:06 -0400 Subject: [PATCH] 5313 Allow I/Os to be aggregated across ZIO priority classes Reviewed by: Andriy Gapon Reviewed by: Will Andrews Reviewed by: Matt Ahrens Reviewed by: George Wilson Approved by: Robert Mustacchi References: https://www.illumos.org/issues/5313 https://github.com/illumos/illumos-gate/commit/fe319232 Ported-by: DHE Signed-off-by: Brian Behlendorf Closes #3280 --- include/sys/vdev_impl.h | 2 ++ include/sys/zio.h | 1 + module/zfs/vdev_queue.c | 79 +++++++++++++++++++++++++---------------- 3 files changed, 51 insertions(+), 31 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index a8dc9510e..1048dec5e 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -113,6 +113,8 @@ struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; avl_tree_t vq_active_tree; + avl_tree_t vq_read_offset_tree; + avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; diff --git a/include/sys/zio.h b/include/sys/zio.h index 18e7a40a3..0368d9c59 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -427,6 +427,7 @@ struct zio { hrtime_t io_delta; /* vdev queue service delta */ uint64_t io_delay; /* vdev disk service delta (ticks) */ avl_node_t io_queue_node; + avl_node_t io_offset_node; /* Internal pipeline state */ enum zio_flag io_flags; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 3fa4219f2..cf0301649 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -190,6 +190,22 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (0); } +static inline avl_tree_t * +vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) +{ + return (&vq->vq_class[p].vqc_queued_tree); +} + +static inline avl_tree_t * +vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) +{ + ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE); + if (t == ZIO_TYPE_READ) + return (&vq->vq_read_offset_tree); + else + return (&vq->vq_write_offset_tree); +} + int vdev_queue_timestamp_compare(const void *x1, const void *x2) { @@ -303,7 +319,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) /* find a queue that has not reached its minimum # outstanding i/os */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 && + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_min_active(p)) return (p); @@ -314,7 +330,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 && + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_max_active(spa, p)) return (p); @@ -335,20 +351,27 @@ vdev_queue_init(vdev_t *vd) avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + int (*compfn) (const void *, const void *); + /* - * The synchronous i/o queues are FIFO rather than LBA ordered. - * This provides more consistent latency for these i/os, and - * they tend to not be tightly clustered anyway so there is - * little to no throughput loss. + * The synchronous i/o queues are dispatched in FIFO rather + * than LBA order. This provides more consistent latency for + * these i/os. */ - boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE); - avl_create(&vq->vq_class[p].vqc_queued_tree, - fifo ? vdev_queue_timestamp_compare : - vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); + if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) + compfn = vdev_queue_timestamp_compare; + else + compfn = vdev_queue_offset_compare; + avl_create(vdev_queue_class_tree(vq, p), compfn, + sizeof (zio_t), offsetof(struct zio, io_queue_node)); } } @@ -359,8 +382,10 @@ vdev_queue_fini(vdev_t *vd) zio_priority_t p; for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(&vq->vq_class[p].vqc_queued_tree); + avl_destroy(vdev_queue_class_tree(vq, p)); avl_destroy(&vq->vq_active_tree); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); mutex_destroy(&vq->vq_lock); } @@ -372,7 +397,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) spa_stats_history_t *ssh = &spa->spa_stats.io_history; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); + avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); if (ssh->kstat != NULL) { mutex_enter(&ssh->lock); @@ -388,7 +414,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) spa_stats_history_t *ssh = &spa->spa_stats.io_history; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); if (ssh->kstat != NULL) { mutex_enter(&ssh->lock); @@ -472,8 +499,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) uint64_t maxgap = 0; uint64_t size; boolean_t stretch = B_FALSE; - vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority]; - avl_tree_t *t = &vqc->vqc_queued_tree; + avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) @@ -486,15 +512,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) zfs_vdev_aggregation_limit = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE); - /* - * The synchronous i/o queues are not sorted by LBA, so we can't - * find adjacent i/os. These i/os tend to not be tightly clustered, - * or too large to aggregate, so this has little impact on performance. - */ - if (zio->io_priority == ZIO_PRIORITY_SYNC_READ || - zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) - return (NULL); - first = last = zio; if (zio->io_type == ZIO_TYPE_READ) @@ -627,7 +644,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) zio_t *zio, *aio; zio_priority_t p; avl_index_t idx; - vdev_queue_class_t *vqc; + avl_tree_t *tree; again: ASSERT(MUTEX_HELD(&vq->vq_lock)); @@ -645,14 +662,14 @@ again: * * For FIFO queues (sync), issue the i/o with the lowest timestamp. */ - vqc = &vq->vq_class[p]; + tree = vdev_queue_class_tree(vq, p); vq->vq_io_search.io_timestamp = 0; vq->vq_io_search.io_offset = vq->vq_last_offset + 1; - VERIFY3P(avl_find(&vqc->vqc_queued_tree, &vq->vq_io_search, + VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); - zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER); + zio = avl_nearest(tree, idx, AVL_AFTER); if (zio == NULL) - zio = avl_first(&vqc->vqc_queued_tree); + zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio);