Support re-prioritizing asynchronous prefetches
When sequential scrubs were merged, all calls to arc_read() (including prefetch IOs) were given ZIO_PRIORITY_ASYNC_READ. Unfortunately, this interacts badly with an existing issue: prefetch IOs cannot be re-prioritized after they have been issued. The result is that synchronous reads end up in the same vdev_queue as the scrub IOs and can see (in some workloads) multiple seconds of latency.

This patch incorporates two changes. First, it ensures that all scrub IOs are given ZIO_PRIORITY_SCRUB, allowing the vdev_queue code to differentiate between these I/Os and user prefetches. Second, it introduces zio_change_priority() to provide the missing capability to upgrade a zio's priority.

Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #6921
Closes #6926
commit a8b2e30685 (parent 993669a7bf)
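The mechanics in miniature: the sketch below is a toy, self-contained model of the re-queue operation described above (hypothetical toy_io/queues names, plain linked lists instead of ZFS's per-priority AVL trees, and no locking). It only illustrates the pattern that vdev_queue_change_io_priority() implements in the diff: unlink a still-pending I/O from its old priority queue, relabel it, and re-insert it at the new priority, while leaving already-issued I/Os untouched.

#include <stdio.h>

enum prio { PRIO_SYNC_READ, PRIO_ASYNC_READ, PRIO_SCRUB, PRIO_NUM };

struct toy_io {
	int blkid;
	enum prio prio;
	struct toy_io *next;		/* link within one priority queue */
};

static struct toy_io *queues[PRIO_NUM];	/* one pending queue per priority */

static void
enqueue(struct toy_io *io)
{
	io->next = queues[io->prio];
	queues[io->prio] = io;
}

/*
 * Move a still-pending I/O to a new priority queue. If the I/O is no
 * longer queued (already issued to the device), it is too late to help,
 * so leave it alone -- mirroring the vdev_queue behavior in the patch.
 */
static void
change_priority(struct toy_io *io, enum prio newp)
{
	struct toy_io **pp = &queues[io->prio];

	while (*pp != NULL && *pp != io)
		pp = &(*pp)->next;
	if (*pp != io)
		return;			/* already active; cannot re-queue */
	*pp = io->next;			/* unlink from the old queue */
	io->prio = newp;
	enqueue(io);			/* re-insert at the new priority */
}

int
main(void)
{
	struct toy_io prefetch = { .blkid = 7, .prio = PRIO_SCRUB, .next = NULL };

	enqueue(&prefetch);
	/* A demand (sync) read arrives for block 7: upgrade the prefetch. */
	change_priority(&prefetch, PRIO_SYNC_READ);
	printf("block %d now waits at priority %d\n",
	    prefetch.blkid, prefetch.prio);
	return (0);
}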
@@ -98,6 +98,7 @@ struct arc_callback {
 	boolean_t		acb_noauth;
 	uint64_t		acb_dsobj;
 	zio_t			*acb_zio_dummy;
+	zio_t			*acb_zio_head;
 	arc_callback_t		*acb_next;
 };
 
@@ -108,7 +108,7 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
-DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async);
+DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
 DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
@@ -123,6 +123,7 @@ extern void vdev_queue_init(vdev_t *vd);
 extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
+extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 
 extern int vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
@@ -586,6 +586,8 @@ extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
 extern void zio_vdev_io_redone(zio_t *zio);
 
+extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
+
 extern void zio_checksum_verified(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
@@ -663,7 +663,7 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_dnode_limit;
 	kstat_named_t arcstat_meta_max;
 	kstat_named_t arcstat_meta_min;
-	kstat_named_t arcstat_sync_wait_for_async;
+	kstat_named_t arcstat_async_upgrade_sync;
 	kstat_named_t arcstat_demand_hit_predictive_prefetch;
 	kstat_named_t arcstat_demand_hit_prescient_prefetch;
 	kstat_named_t arcstat_need_free;
@@ -763,7 +763,7 @@ static arc_stats_t arc_stats = {
 	{ "arc_dnode_limit",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
-	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
+	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
 	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "arc_need_free",		KSTAT_DATA_UINT64 },
@@ -5911,32 +5911,20 @@ top:
 		*arc_flags |= ARC_FLAG_CACHED;
 
 		if (HDR_IO_IN_PROGRESS(hdr)) {
+			zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
+
+			ASSERT3P(head_zio, !=, NULL);
 			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
 			    priority == ZIO_PRIORITY_SYNC_READ) {
 				/*
-				 * This sync read must wait for an
-				 * in-progress async read (e.g. a predictive
-				 * prefetch).  Async reads are queued
-				 * separately at the vdev_queue layer, so
-				 * this is a form of priority inversion.
-				 * Ideally, we would "inherit" the demand
-				 * i/o's priority by moving the i/o from
-				 * the async queue to the synchronous queue,
-				 * but there is currently no mechanism to do
-				 * so.  Track this so that we can evaluate
-				 * the magnitude of this potential performance
-				 * problem.
-				 *
-				 * Note that if the prefetch i/o is already
-				 * active (has been issued to the device),
-				 * the prefetch improved performance, because
-				 * we issued it sooner than we would have
-				 * without the prefetch.
+				 * This is a sync read that needs to wait for
+				 * an in-flight async read. Request that the
+				 * zio have its priority upgraded.
 				 */
-				DTRACE_PROBE1(arc__sync__wait__for__async,
+				zio_change_priority(head_zio, priority);
+				DTRACE_PROBE1(arc__async__upgrade__sync,
 				    arc_buf_hdr_t *, hdr);
-				ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+				ARCSTAT_BUMP(arcstat_async_upgrade_sync);
 			}
 			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
 				arc_hdr_clear_flags(hdr,
@@ -5966,6 +5954,7 @@ top:
 			    spa, NULL, NULL, NULL, zio_flags);
 
 			ASSERT3P(acb->acb_done, !=, NULL);
+			acb->acb_zio_head = head_zio;
 			acb->acb_next = hdr->b_l1hdr.b_acb;
 			hdr->b_l1hdr.b_acb = acb;
 			mutex_exit(hash_lock);
@@ -6182,14 +6171,17 @@ top:
 			vd = NULL;
 		}
 
-		if (priority == ZIO_PRIORITY_ASYNC_READ)
+		/*
+		 * We count both async reads and scrub IOs as asynchronous so
+		 * that both can be upgraded in the event of a cache hit while
+		 * the read IO is still in-flight.
+		 */
+		if (priority == ZIO_PRIORITY_ASYNC_READ ||
+		    priority == ZIO_PRIORITY_SCRUB)
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 		else
 			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 
 		if (hash_lock != NULL)
 			mutex_exit(hash_lock);
 
 		/*
 		 * At this point, we have a level 1 cache miss. Try again in
 		 * L2ARC if possible.
@@ -6260,6 +6252,10 @@ top:
 			    ZIO_FLAG_CANFAIL |
 			    ZIO_FLAG_DONT_PROPAGATE |
 			    ZIO_FLAG_DONT_RETRY, B_FALSE);
+			acb->acb_zio_head = rzio;
+
+			if (hash_lock != NULL)
+				mutex_exit(hash_lock);
 
 			DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 			    zio_t *, rzio);
@@ -6276,6 +6272,8 @@ top:
 				goto out;
 
 			/* l2arc read error; goto zio_read() */
+			if (hash_lock != NULL)
+				mutex_enter(hash_lock);
 		} else {
 			DTRACE_PROBE1(l2arc__miss,
 			    arc_buf_hdr_t *, hdr);
@@ -6296,6 +6294,10 @@ top:
 
 		rzio = zio_read(pio, spa, bp, hdr_abd, size,
 		    arc_read_done, hdr, priority, zio_flags, zb);
+		acb->acb_zio_head = rzio;
+
+		if (hash_lock != NULL)
+			mutex_exit(hash_lock);
 
 		if (*arc_flags & ARC_FLAG_WAIT) {
 			rc = zio_wait(rzio);
@@ -1529,7 +1529,7 @@ dsl_scan_prefetch_thread(void *arg)
 		/* issue the prefetch asynchronously */
 		(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
 		    &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
-		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &spic->spic_zb);
+		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
 
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
@@ -1611,7 +1611,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
-		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
@@ -1639,7 +1639,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 		}
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
-		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
@@ -1658,7 +1658,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
-		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
@@ -803,6 +803,48 @@ vdev_queue_io_done(zio_t *zio)
 	mutex_exit(&vq->vq_lock);
 }
 
+void
+vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	avl_tree_t *tree;
+
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+	if (zio->io_type == ZIO_TYPE_READ) {
+		if (priority != ZIO_PRIORITY_SYNC_READ &&
+		    priority != ZIO_PRIORITY_ASYNC_READ &&
+		    priority != ZIO_PRIORITY_SCRUB)
+			priority = ZIO_PRIORITY_ASYNC_READ;
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		if (priority != ZIO_PRIORITY_SYNC_WRITE &&
+		    priority != ZIO_PRIORITY_ASYNC_WRITE)
+			priority = ZIO_PRIORITY_ASYNC_WRITE;
+	}
+
+	mutex_enter(&vq->vq_lock);
+
+	/*
+	 * If the zio is in none of the queues we can simply change
+	 * the priority. If the zio is waiting to be submitted we must
+	 * remove it from the queue and re-insert it with the new priority.
+	 * Otherwise, the zio is currently active and we cannot change its
+	 * priority.
+	 */
+	tree = vdev_queue_class_tree(vq, zio->io_priority);
+	if (avl_find(tree, zio, NULL) == zio) {
+		avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+		zio->io_priority = priority;
+		avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+		zio->io_priority = priority;
+	}
+
+	mutex_exit(&vq->vq_lock);
+}
+
 /*
  * As these two methods are only used for load calculations we're not
  * concerned if we get an incorrect value on 32bit platforms due to lack of
@@ -539,6 +539,8 @@ zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
 	list_t *cl = &pio->io_child_list;
 
+	ASSERT(MUTEX_HELD(&pio->io_lock));
+
 	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
 	if (*zl == NULL)
 		return (NULL);
@@ -573,8 +575,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
-	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
+	mutex_enter(&cio->io_lock);
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
@@ -587,8 +589,8 @@ zio_add_child(zio_t *pio, zio_t *cio)
 	pio->io_child_count++;
 	cio->io_parent_count++;
 
-	mutex_exit(&pio->io_lock);
 	mutex_exit(&cio->io_lock);
+	mutex_exit(&pio->io_lock);
 }
 
 static void
@@ -597,8 +599,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 	ASSERT(zl->zl_parent == pio);
 	ASSERT(zl->zl_child == cio);
 
-	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
+	mutex_enter(&cio->io_lock);
 
 	list_remove(&pio->io_child_list, zl);
 	list_remove(&cio->io_parent_list, zl);
@@ -606,8 +608,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 	pio->io_child_count--;
 	cio->io_parent_count--;
 
-	mutex_exit(&pio->io_lock);
 	mutex_exit(&cio->io_lock);
+	mutex_exit(&pio->io_lock);
 	kmem_cache_free(zio_link_cache, zl);
 }
 
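A note on the zio_add_child()/zio_remove_child() hunks above: the patch makes every path acquire the parent's lock before the child's (and release in the reverse order), which matters because zio_change_priority() below walks a parent's child list while holding pio->io_lock. The following is a minimal, standalone pthreads sketch of that invariant (hypothetical node/link_child names, not ZFS code):

#include <pthread.h>
#include <stdio.h>

struct node {
	pthread_mutex_t lock;
	int child_count;
};

/*
 * Every code path acquires locks in the same order -- parent first,
 * then child -- and releases them in reverse. Two threads operating
 * on the same pair can then never each hold one lock while waiting
 * on the other (the classic ABBA deadlock).
 */
static void
link_child(struct node *parent, struct node *child)
{
	pthread_mutex_lock(&parent->lock);	/* parent first */
	pthread_mutex_lock(&child->lock);	/* then child */
	parent->child_count++;
	pthread_mutex_unlock(&child->lock);
	pthread_mutex_unlock(&parent->lock);
}

int
main(void)
{
	struct node p = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct node c = { PTHREAD_MUTEX_INITIALIZER, 0 };

	link_child(&p, &c);
	printf("children: %d\n", p.child_count);
	return (0);
}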
@@ -1963,14 +1965,16 @@ zio_reexecute(zio_t *pio)
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
 	zio_link_t *zl = NULL;
+	mutex_enter(&pio->io_lock);
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		cio_next = zio_walk_children(pio, &zl);
-		mutex_enter(&pio->io_lock);
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			pio->io_children[cio->io_child_type][w]++;
 		mutex_exit(&pio->io_lock);
 		zio_reexecute(cio);
+		mutex_enter(&pio->io_lock);
 	}
+	mutex_exit(&pio->io_lock);
 
 	/*
 	 * Now that all children have been reexecuted, execute the parent.
@@ -3474,6 +3478,35 @@ zio_vdev_io_done(zio_t *zio)
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
+/*
+ * This function is used to change the priority of an existing zio that is
+ * currently in-flight. This is used by the arc to upgrade priority in the
+ * event that a demand read is made for a block that is currently queued
+ * as a scrub or async read IO. Otherwise, the high priority read request
+ * would end up having to wait for the lower priority IO.
+ */
+void
+zio_change_priority(zio_t *pio, zio_priority_t priority)
+{
+	zio_t *cio, *cio_next;
+	zio_link_t *zl = NULL;
+
+	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+	if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
+		vdev_queue_change_io_priority(pio, priority);
+	} else {
+		pio->io_priority = priority;
+	}
+
+	mutex_enter(&pio->io_lock);
+	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+		cio_next = zio_walk_children(pio, &zl);
+		zio_change_priority(cio, priority);
+	}
+	mutex_exit(&pio->io_lock);
+}
+
 /*
  * For non-raidz ZIOs, we can just copy aside the bad data read from the
  * disk, and use that to finish the checksum ereport later.
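The recursion in zio_change_priority() matters because a logical zio may fan out into child zios, and only a zio on a leaf vdev actually sits in a vdev queue, so the new priority must be pushed down the whole tree. A toy, self-contained sketch of that propagation pattern (hypothetical tio type, no locking, not ZFS code):

#include <stdio.h>

struct tio {
	int priority;
	int is_leaf;
	struct tio *children[4];	/* NULL-terminated child list */
};

static void
tio_change_priority(struct tio *pio, int priority)
{
	pio->priority = priority;
	if (pio->is_leaf)
		return;		/* leaf: the real code re-queues here */
	/* Interior node: propagate the new priority to every child. */
	for (int i = 0; pio->children[i] != NULL; i++)
		tio_change_priority(pio->children[i], priority);
}

int
main(void)
{
	struct tio leaf1 = { 2, 1, { NULL } };
	struct tio leaf2 = { 2, 1, { NULL } };
	struct tio root  = { 2, 0, { &leaf1, &leaf2, NULL } };

	tio_change_priority(&root, 0);	/* 0 plays "sync read" here */
	printf("%d %d %d\n", root.priority, leaf1.priority, leaf2.priority);
	return (0);
}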
spa_stats.io_history (new, empty file)
@@ -41,9 +41,9 @@ function get_prefetched_demand_reads
 	echo $demand_reads
 }
 
-function get_sync_wait_for_async
+function get_async_upgrade_sync
 {
-	typeset -l sync_wait=`awk '$1 == "sync_wait_for_async" \
+	typeset -l sync_wait=`awk '$1 == "async_upgrade_sync" \
 	    { print $3 }' $zfs_kstats/arcstats`
 
 	echo $sync_wait
@@ -59,7 +59,7 @@ poolname=$1
 interval=$2
 prefetch_ios=$(get_prefetch_ios)
 prefetched_demand_reads=$(get_prefetched_demand_reads)
-sync_wait_for_async=$(get_sync_wait_for_async)
+async_upgrade_sync=$(get_async_upgrade_sync)
 
 while true
 do
@@ -73,10 +73,10 @@ do
 	    $(( $new_prefetched_demand_reads - $prefetched_demand_reads ))
 	prefetched_demand_reads=$new_prefetched_demand_reads
 
-	new_sync_wait_for_async=$(get_sync_wait_for_async)
-	printf "%-24s\t%u\n" "sync_wait_for_async" \
-	    $(( $new_sync_wait_for_async - $sync_wait_for_async ))
-	sync_wait_for_async=$new_sync_wait_for_async
+	new_async_upgrade_sync=$(get_async_upgrade_sync)
+	printf "%-24s\t%u\n" "async_upgrade_sync" \
+	    $(( $new_async_upgrade_sync - $async_upgrade_sync ))
+	async_upgrade_sync=$new_async_upgrade_sync
 
 	sleep $interval
 done