mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-12 19:20:28 +03:00
5313 Allow I/Os to be aggregated across ZIO priority classes
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Will Andrews <willa@SpectraLogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
  https://www.illumos.org/issues/5313
  https://github.com/illumos/illumos-gate/commit/fe319232
Ported-by: DHE <git@dehacked.net>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3280
This commit is contained in:
parent
0bf8501ae1
commit
ec8501ee12
@ -113,6 +113,8 @@ struct vdev_queue {
|
|||||||
vdev_t *vq_vdev;
|
vdev_t *vq_vdev;
|
||||||
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
|
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
|
||||||
avl_tree_t vq_active_tree;
|
avl_tree_t vq_active_tree;
|
||||||
|
avl_tree_t vq_read_offset_tree;
|
||||||
|
avl_tree_t vq_write_offset_tree;
|
||||||
uint64_t vq_last_offset;
|
uint64_t vq_last_offset;
|
||||||
hrtime_t vq_io_complete_ts; /* time last i/o completed */
|
hrtime_t vq_io_complete_ts; /* time last i/o completed */
|
||||||
hrtime_t vq_io_delta_ts;
|
hrtime_t vq_io_delta_ts;
|
||||||
|
@ -427,6 +427,7 @@ struct zio {
|
|||||||
hrtime_t io_delta; /* vdev queue service delta */
|
hrtime_t io_delta; /* vdev queue service delta */
|
||||||
uint64_t io_delay; /* vdev disk service delta (ticks) */
|
uint64_t io_delay; /* vdev disk service delta (ticks) */
|
||||||
avl_node_t io_queue_node;
|
avl_node_t io_queue_node;
|
||||||
|
avl_node_t io_offset_node;
|
||||||
|
|
||||||
/* Internal pipeline state */
|
/* Internal pipeline state */
|
||||||
enum zio_flag io_flags;
|
enum zio_flag io_flags;
|
||||||
|
@ -190,6 +190,22 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
|
|||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline avl_tree_t *
|
||||||
|
vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
|
||||||
|
{
|
||||||
|
return (&vq->vq_class[p].vqc_queued_tree);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline avl_tree_t *
|
||||||
|
vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
|
||||||
|
{
|
||||||
|
ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
|
||||||
|
if (t == ZIO_TYPE_READ)
|
||||||
|
return (&vq->vq_read_offset_tree);
|
||||||
|
else
|
||||||
|
return (&vq->vq_write_offset_tree);
|
||||||
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
vdev_queue_timestamp_compare(const void *x1, const void *x2)
|
vdev_queue_timestamp_compare(const void *x1, const void *x2)
|
||||||
{
|
{
|
||||||
@ -303,7 +319,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
|
|||||||
|
|
||||||
/* find a queue that has not reached its minimum # outstanding i/os */
|
/* find a queue that has not reached its minimum # outstanding i/os */
|
||||||
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
|
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
|
||||||
if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
|
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
|
||||||
vq->vq_class[p].vqc_active <
|
vq->vq_class[p].vqc_active <
|
||||||
vdev_queue_class_min_active(p))
|
vdev_queue_class_min_active(p))
|
||||||
return (p);
|
return (p);
|
||||||
@ -314,7 +330,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
|
|||||||
* maximum # outstanding i/os.
|
* maximum # outstanding i/os.
|
||||||
*/
|
*/
|
||||||
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
|
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
|
||||||
if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
|
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
|
||||||
vq->vq_class[p].vqc_active <
|
vq->vq_class[p].vqc_active <
|
||||||
vdev_queue_class_max_active(spa, p))
|
vdev_queue_class_max_active(spa, p))
|
||||||
return (p);
|
return (p);
|
||||||
@ -335,19 +351,26 @@ vdev_queue_init(vdev_t *vd)
|
|||||||
|
|
||||||
avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
|
avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
|
||||||
sizeof (zio_t), offsetof(struct zio, io_queue_node));
|
sizeof (zio_t), offsetof(struct zio, io_queue_node));
|
||||||
|
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
|
||||||
|
vdev_queue_offset_compare, sizeof (zio_t),
|
||||||
|
offsetof(struct zio, io_offset_node));
|
||||||
|
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
|
||||||
|
vdev_queue_offset_compare, sizeof (zio_t),
|
||||||
|
offsetof(struct zio, io_offset_node));
|
||||||
|
|
||||||
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
|
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
|
||||||
|
int (*compfn) (const void *, const void *);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The synchronous i/o queues are FIFO rather than LBA ordered.
|
* The synchronous i/o queues are dispatched in FIFO rather
|
||||||
* This provides more consistent latency for these i/os, and
|
* than LBA order. This provides more consistent latency for
|
||||||
* they tend to not be tightly clustered anyway so there is
|
* these i/os.
|
||||||
* little to no throughput loss.
|
|
||||||
*/
|
*/
|
||||||
boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ ||
|
if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
|
||||||
p == ZIO_PRIORITY_SYNC_WRITE);
|
compfn = vdev_queue_timestamp_compare;
|
||||||
avl_create(&vq->vq_class[p].vqc_queued_tree,
|
else
|
||||||
fifo ? vdev_queue_timestamp_compare :
|
compfn = vdev_queue_offset_compare;
|
||||||
vdev_queue_offset_compare,
|
avl_create(vdev_queue_class_tree(vq, p), compfn,
|
||||||
sizeof (zio_t), offsetof(struct zio, io_queue_node));
|
sizeof (zio_t), offsetof(struct zio, io_queue_node));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -359,8 +382,10 @@ vdev_queue_fini(vdev_t *vd)
|
|||||||
zio_priority_t p;
|
zio_priority_t p;
|
||||||
|
|
||||||
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
|
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
|
||||||
avl_destroy(&vq->vq_class[p].vqc_queued_tree);
|
avl_destroy(vdev_queue_class_tree(vq, p));
|
||||||
avl_destroy(&vq->vq_active_tree);
|
avl_destroy(&vq->vq_active_tree);
|
||||||
|
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
|
||||||
|
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
|
||||||
|
|
||||||
mutex_destroy(&vq->vq_lock);
|
mutex_destroy(&vq->vq_lock);
|
||||||
}
|
}
|
||||||
@ -372,7 +397,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
|
|||||||
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
|
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
|
||||||
|
|
||||||
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
|
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
|
||||||
avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
|
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
|
||||||
|
avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
|
||||||
|
|
||||||
if (ssh->kstat != NULL) {
|
if (ssh->kstat != NULL) {
|
||||||
mutex_enter(&ssh->lock);
|
mutex_enter(&ssh->lock);
|
||||||
@ -388,7 +414,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
|
|||||||
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
|
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
|
||||||
|
|
||||||
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
|
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
|
||||||
avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
|
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
|
||||||
|
avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
|
||||||
|
|
||||||
if (ssh->kstat != NULL) {
|
if (ssh->kstat != NULL) {
|
||||||
mutex_enter(&ssh->lock);
|
mutex_enter(&ssh->lock);
|
||||||
@ -472,8 +499,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
|
|||||||
uint64_t maxgap = 0;
|
uint64_t maxgap = 0;
|
||||||
uint64_t size;
|
uint64_t size;
|
||||||
boolean_t stretch = B_FALSE;
|
boolean_t stretch = B_FALSE;
|
||||||
vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
|
avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
|
||||||
avl_tree_t *t = &vqc->vqc_queued_tree;
|
|
||||||
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
|
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
|
||||||
|
|
||||||
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
|
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
|
||||||
@ -486,15 +512,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
|
|||||||
zfs_vdev_aggregation_limit =
|
zfs_vdev_aggregation_limit =
|
||||||
MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
|
MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
|
||||||
|
|
||||||
/*
|
|
||||||
* The synchronous i/o queues are not sorted by LBA, so we can't
|
|
||||||
* find adjacent i/os. These i/os tend to not be tightly clustered,
|
|
||||||
* or too large to aggregate, so this has little impact on performance.
|
|
||||||
*/
|
|
||||||
if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
|
|
||||||
zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
|
|
||||||
return (NULL);
|
|
||||||
|
|
||||||
first = last = zio;
|
first = last = zio;
|
||||||
|
|
||||||
if (zio->io_type == ZIO_TYPE_READ)
|
if (zio->io_type == ZIO_TYPE_READ)
|
||||||
@ -627,7 +644,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
|
|||||||
zio_t *zio, *aio;
|
zio_t *zio, *aio;
|
||||||
zio_priority_t p;
|
zio_priority_t p;
|
||||||
avl_index_t idx;
|
avl_index_t idx;
|
||||||
vdev_queue_class_t *vqc;
|
avl_tree_t *tree;
|
||||||
|
|
||||||
again:
|
again:
|
||||||
ASSERT(MUTEX_HELD(&vq->vq_lock));
|
ASSERT(MUTEX_HELD(&vq->vq_lock));
|
||||||
@ -645,14 +662,14 @@ again:
|
|||||||
*
|
*
|
||||||
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
|
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
|
||||||
*/
|
*/
|
||||||
vqc = &vq->vq_class[p];
|
tree = vdev_queue_class_tree(vq, p);
|
||||||
vq->vq_io_search.io_timestamp = 0;
|
vq->vq_io_search.io_timestamp = 0;
|
||||||
vq->vq_io_search.io_offset = vq->vq_last_offset + 1;
|
vq->vq_io_search.io_offset = vq->vq_last_offset + 1;
|
||||||
VERIFY3P(avl_find(&vqc->vqc_queued_tree, &vq->vq_io_search,
|
VERIFY3P(avl_find(tree, &vq->vq_io_search,
|
||||||
&idx), ==, NULL);
|
&idx), ==, NULL);
|
||||||
zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
|
zio = avl_nearest(tree, idx, AVL_AFTER);
|
||||||
if (zio == NULL)
|
if (zio == NULL)
|
||||||
zio = avl_first(&vqc->vqc_queued_tree);
|
zio = avl_first(tree);
|
||||||
ASSERT3U(zio->io_priority, ==, p);
|
ASSERT3U(zio->io_priority, ==, p);
|
||||||
|
|
||||||
aio = vdev_queue_aggregate(vq, zio);
|
aio = vdev_queue_aggregate(vq, zio);
|
||||||
|
Loading…
Reference in New Issue
Block a user