mirror_zfs/module/zfs/vdev_queue.c

539 lines
14 KiB
C
Raw Normal View History

2008-11-20 23:01:55 +03:00
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
2009-02-18 23:51:31 +03:00
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
2008-11-20 23:01:55 +03:00
* Use is subject to license terms.
*/
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
2008-11-20 23:01:55 +03:00
#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
#include <sys/spa_impl.h>
2008-11-20 23:01:55 +03:00
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/kstat.h>
2008-11-20 23:01:55 +03:00
/*
* These tunables are for performance analysis.
*/
/* The maximum number of I/Os concurrently pending to each device. */
int zfs_vdev_max_pending = 10;
2008-11-20 23:01:55 +03:00
/*
* The initial number of I/Os pending to each device, before it starts ramping
* up to zfs_vdev_max_pending.
2008-11-20 23:01:55 +03:00
*/
int zfs_vdev_min_pending = 4;
Illumos #3618 ::zio dcmd does not show timestamp data 3618 ::zio dcmd does not show timestamp data Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: George Wilson <gwilson@zfsmail.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Garrett D'Amore <garrett@damore.org> Approved by: Dan McDonald <danmcd@nexenta.com> References: http://www.illumos.org/issues/3618 illumos/illumos-gate@c55e05cb35da47582b7afd38734d2f0d9c6deb40 Notes on porting to ZFS on Linux: The original changeset mostly deals with mdb ::zio dcmd. However, in order to provide the requested functionality it modifies vdev and zio structures to keep the timing data in nanoseconds instead of ticks. It is these changes that are ported over in the commit in hand. One visible change of this commit is that the default value of 'zfs_vdev_time_shift' tunable is changed: zfs_vdev_time_shift = 6 to zfs_vdev_time_shift = 29 The original value of 6 was inherited from OpenSolaris and was subotimal - since it shifted the raw tick value - it didn't compensate for different tick frequencies on Linux and OpenSolaris. The former has HZ=1000, while the latter HZ=100. (Which itself led to other interesting performance anomalies under non-trivial load. The deadline scheduler delays the IO according to its priority - the lower priority the further the deadline is set. The delay is measured in units of "shifted ticks". Since the HZ value was 10 times higher, the delay units were 10 times shorter. Thus really low priority IO like resilver (delay is 10 units) and scrub (delay is 20 units) were scheduled much sooner than intended. The overall effect is that resilver and scrub IO consumed more bandwidth at the expense of the other IO.) Now that the bookkeeping is done is nanoseconds the shift behaves correctly for any tick frequency (HZ). Ported-by: Cyril Plisko <cyril.plisko@mountall.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #1643
2013-03-22 02:47:36 +04:00
/*
* The deadlines are grouped into buckets based on zfs_vdev_time_shift:
* deadline = pri + gethrtime() >> time_shift)
*/
int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */
2008-11-20 23:01:55 +03:00
/* exponential I/O issue ramp-up rate */
int zfs_vdev_ramp_rate = 2;
/*
2009-08-18 22:43:27 +04:00
* To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
* For read I/Os, we also aggregate across small adjacency gaps; for writes
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
2008-11-20 23:01:55 +03:00
*/
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
2009-07-03 02:44:48 +04:00
int zfs_vdev_read_gap_limit = 32 << 10;
2009-08-18 22:43:27 +04:00
int zfs_vdev_write_gap_limit = 4 << 10;
2008-11-20 23:01:55 +03:00
/*
* Virtual device vector for disk I/O scheduling.
*/
int
vdev_queue_deadline_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;
if (z1->io_deadline < z2->io_deadline)
return (-1);
if (z1->io_deadline > z2->io_deadline)
return (1);
if (z1->io_offset < z2->io_offset)
return (-1);
if (z1->io_offset > z2->io_offset)
return (1);
if (z1 < z2)
return (-1);
if (z1 > z2)
return (1);
return (0);
}
int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;
if (z1->io_offset < z2->io_offset)
return (-1);
if (z1->io_offset > z2->io_offset)
return (1);
if (z1 < z2)
return (-1);
if (z1 > z2)
return (1);
return (0);
}
void
vdev_queue_init(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
int i;
2008-11-20 23:01:55 +03:00
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
sizeof (zio_t), offsetof(struct zio, io_deadline_node));
avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_offset_node));
avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_offset_node));
avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_offset_node));
/*
* A list of buffers which can be used for aggregate I/O, this
* avoids the need to allocate them on demand when memory is low.
*/
list_create(&vq->vq_io_list, sizeof (vdev_io_t),
offsetof(vdev_io_t, vi_node));
for (i = 0; i < zfs_vdev_max_pending; i++)
list_insert_tail(&vq->vq_io_list, zio_vdev_alloc());
2008-11-20 23:01:55 +03:00
}
void
vdev_queue_fini(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
vdev_io_t *vi;
2008-11-20 23:01:55 +03:00
avl_destroy(&vq->vq_deadline_tree);
avl_destroy(&vq->vq_read_tree);
avl_destroy(&vq->vq_write_tree);
avl_destroy(&vq->vq_pending_tree);
while ((vi = list_head(&vq->vq_io_list)) != NULL) {
list_remove(&vq->vq_io_list, vi);
zio_vdev_free(vi);
}
list_destroy(&vq->vq_io_list);
2008-11-20 23:01:55 +03:00
mutex_destroy(&vq->vq_lock);
}
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
2008-11-20 23:01:55 +03:00
avl_add(&vq->vq_deadline_tree, zio);
avl_add(zio->io_vdev_tree, zio);
if (ssh->kstat != NULL) {
mutex_enter(&ssh->lock);
kstat_waitq_enter(ssh->kstat->ks_data);
mutex_exit(&ssh->lock);
}
2008-11-20 23:01:55 +03:00
}
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
2008-11-20 23:01:55 +03:00
avl_remove(&vq->vq_deadline_tree, zio);
avl_remove(zio->io_vdev_tree, zio);
if (ssh->kstat != NULL) {
mutex_enter(&ssh->lock);
kstat_waitq_exit(ssh->kstat->ks_data);
mutex_exit(&ssh->lock);
}
}
static void
vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
avl_add(&vq->vq_pending_tree, zio);
if (ssh->kstat != NULL) {
mutex_enter(&ssh->lock);
kstat_runq_enter(ssh->kstat->ks_data);
mutex_exit(&ssh->lock);
}
}
static void
vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
avl_remove(&vq->vq_pending_tree, zio);
if (ssh->kstat != NULL) {
kstat_io_t *ksio = ssh->kstat->ks_data;
mutex_enter(&ssh->lock);
kstat_runq_exit(ksio);
if (zio->io_type == ZIO_TYPE_READ) {
ksio->reads++;
ksio->nread += zio->io_size;
} else if (zio->io_type == ZIO_TYPE_WRITE) {
ksio->writes++;
ksio->nwritten += zio->io_size;
}
mutex_exit(&ssh->lock);
}
2008-11-20 23:01:55 +03:00
}
static void
vdev_queue_agg_io_done(zio_t *aio)
{
vdev_queue_t *vq = &aio->io_vd->vdev_queue;
vdev_io_t *vi = aio->io_data;
2009-02-18 23:51:31 +03:00
zio_t *pio;
2008-11-20 23:01:55 +03:00
2009-02-18 23:51:31 +03:00
while ((pio = zio_walk_parents(aio)) != NULL)
2008-11-20 23:01:55 +03:00
if (aio->io_type == ZIO_TYPE_READ)
2009-02-18 23:51:31 +03:00
bcopy((char *)aio->io_data + (pio->io_offset -
aio->io_offset), pio->io_data, pio->io_size);
2008-11-20 23:01:55 +03:00
mutex_enter(&vq->vq_lock);
list_insert_tail(&vq->vq_io_list, vi);
mutex_exit(&vq->vq_lock);
2008-11-20 23:01:55 +03:00
}
2009-07-03 02:44:48 +04:00
/*
* Compute the range spanned by two i/os, which is the endpoint of the last
* (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
* Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
* thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
*/
#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
2008-11-20 23:01:55 +03:00
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
2009-08-18 22:43:27 +04:00
zio_t *fio, *lio, *aio, *dio, *nio, *mio;
2009-02-18 23:51:31 +03:00
avl_tree_t *t;
vdev_io_t *vi;
2009-01-16 00:59:39 +03:00
int flags;
uint64_t maxspan = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
2009-07-03 02:44:48 +04:00
uint64_t maxgap;
2009-08-18 22:43:27 +04:00
int stretch;
2008-11-20 23:01:55 +03:00
2009-08-18 22:43:27 +04:00
again:
2008-11-20 23:01:55 +03:00
ASSERT(MUTEX_HELD(&vq->vq_lock));
if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
avl_numnodes(&vq->vq_deadline_tree) == 0)
return (NULL);
fio = lio = avl_first(&vq->vq_deadline_tree);
2009-02-18 23:51:31 +03:00
t = fio->io_vdev_tree;
2009-01-16 00:59:39 +03:00
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
2009-07-03 02:44:48 +04:00
maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
2009-01-16 00:59:39 +03:00
vi = list_head(&vq->vq_io_list);
if (vi == NULL) {
vi = zio_vdev_alloc();
list_insert_head(&vq->vq_io_list, vi);
}
2009-01-16 00:59:39 +03:00
if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
/*
2009-08-18 22:43:27 +04:00
* We can aggregate I/Os that are sufficiently adjacent and of
* the same flavor, as expressed by the AGG_INHERIT flags.
* The latter requirement is necessary so that certain
* attributes of the I/O, such as whether it's a normal I/O
* or a scrub/resilver, can be preserved in the aggregate.
* We can include optional I/Os, but don't allow them
* to begin a range as they add no benefit in that situation.
*/
/*
* We keep track of the last non-optional I/O.
*/
mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
/*
* Walk backwards through sufficiently contiguous I/Os
* recording the last non-option I/O.
2009-01-16 00:59:39 +03:00
*/
2009-02-18 23:51:31 +03:00
while ((dio = AVL_PREV(t, fio)) != NULL &&
2009-01-16 00:59:39 +03:00
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
2009-08-18 22:43:27 +04:00
IO_SPAN(dio, lio) <= maxspan &&
IO_GAP(dio, fio) <= maxgap) {
2009-01-16 00:59:39 +03:00
fio = dio;
2009-08-18 22:43:27 +04:00
if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
mio = fio;
}
/*
* Skip any initial optional I/Os.
*/
while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
fio = AVL_NEXT(t, fio);
ASSERT(fio != NULL);
}
2009-07-03 02:44:48 +04:00
2009-08-18 22:43:27 +04:00
/*
* Walk forward through sufficiently contiguous I/Os.
*/
2009-02-18 23:51:31 +03:00
while ((dio = AVL_NEXT(t, lio)) != NULL &&
2009-01-16 00:59:39 +03:00
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
2009-08-18 22:43:27 +04:00
IO_SPAN(fio, dio) <= maxspan &&
IO_GAP(lio, dio) <= maxgap) {
2009-01-16 00:59:39 +03:00
lio = dio;
2009-08-18 22:43:27 +04:00
if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
mio = lio;
}
/*
* Now that we've established the range of the I/O aggregation
* we must decide what to do with trailing optional I/Os.
* For reads, there's nothing to do. While we are unable to
* aggregate further, it's possible that a trailing optional
* I/O would allow the underlying device to aggregate with
* subsequent I/Os. We must therefore determine if the next
* non-optional I/O is close enough to make aggregation
* worthwhile.
*/
stretch = B_FALSE;
if (t != &vq->vq_read_tree && mio != NULL) {
nio = lio;
while ((dio = AVL_NEXT(t, nio)) != NULL &&
IO_GAP(nio, dio) == 0 &&
IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
nio = dio;
if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
stretch = B_TRUE;
break;
}
}
}
if (stretch) {
/* This may be a no-op. */
VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
} else {
while (lio != mio && lio != fio) {
ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
lio = AVL_PREV(t, lio);
ASSERT(lio != NULL);
}
}
2008-11-20 23:01:55 +03:00
}
if (fio != lio) {
2009-07-03 02:44:48 +04:00
uint64_t size = IO_SPAN(fio, lio);
ASSERT(size <= maxspan);
ASSERT(vi != NULL);
2008-11-20 23:01:55 +03:00
aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
vi, size, fio->io_type, ZIO_PRIORITY_AGG,
2009-01-16 00:59:39 +03:00
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
2008-11-20 23:01:55 +03:00
vdev_queue_agg_io_done, NULL);
aio->io_timestamp = fio->io_timestamp;
2008-11-20 23:01:55 +03:00
2009-07-03 02:44:48 +04:00
nio = fio;
do {
dio = nio;
nio = AVL_NEXT(t, dio);
2008-11-20 23:01:55 +03:00
ASSERT(dio->io_type == aio->io_type);
2009-02-18 23:51:31 +03:00
ASSERT(dio->io_vdev_tree == t);
2009-08-18 22:43:27 +04:00
if (dio->io_flags & ZIO_FLAG_NODATA) {
ASSERT(dio->io_type == ZIO_TYPE_WRITE);
bzero((char *)aio->io_data + (dio->io_offset -
aio->io_offset), dio->io_size);
} else if (dio->io_type == ZIO_TYPE_WRITE) {
2009-02-18 23:51:31 +03:00
bcopy(dio->io_data, (char *)aio->io_data +
(dio->io_offset - aio->io_offset),
dio->io_size);
2009-08-18 22:43:27 +04:00
}
2009-02-18 23:51:31 +03:00
zio_add_child(dio, aio);
2008-11-20 23:01:55 +03:00
vdev_queue_io_remove(vq, dio);
zio_vdev_io_bypass(dio);
2009-02-18 23:51:31 +03:00
zio_execute(dio);
2009-07-03 02:44:48 +04:00
} while (dio != lio);
2008-11-20 23:01:55 +03:00
vdev_queue_pending_add(vq, aio);
list_remove(&vq->vq_io_list, vi);
2008-11-20 23:01:55 +03:00
return (aio);
}
2009-02-18 23:51:31 +03:00
ASSERT(fio->io_vdev_tree == t);
2008-11-20 23:01:55 +03:00
vdev_queue_io_remove(vq, fio);
2009-08-18 22:43:27 +04:00
/*
* If the I/O is or was optional and therefore has no data, we need to
* simply discard it. We need to drop the vdev queue's lock to avoid a
* deadlock that we could encounter since this I/O will complete
* immediately.
*/
if (fio->io_flags & ZIO_FLAG_NODATA) {
mutex_exit(&vq->vq_lock);
zio_vdev_io_bypass(fio);
zio_execute(fio);
mutex_enter(&vq->vq_lock);
goto again;
}
vdev_queue_pending_add(vq, fio);
2008-11-20 23:01:55 +03:00
return (fio);
}
zio_t *
vdev_queue_io(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
return (zio);
zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
if (zio->io_type == ZIO_TYPE_READ)
zio->io_vdev_tree = &vq->vq_read_tree;
else
zio->io_vdev_tree = &vq->vq_write_tree;
mutex_enter(&vq->vq_lock);
Illumos #3618 ::zio dcmd does not show timestamp data 3618 ::zio dcmd does not show timestamp data Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: George Wilson <gwilson@zfsmail.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Garrett D'Amore <garrett@damore.org> Approved by: Dan McDonald <danmcd@nexenta.com> References: http://www.illumos.org/issues/3618 illumos/illumos-gate@c55e05cb35da47582b7afd38734d2f0d9c6deb40 Notes on porting to ZFS on Linux: The original changeset mostly deals with mdb ::zio dcmd. However, in order to provide the requested functionality it modifies vdev and zio structures to keep the timing data in nanoseconds instead of ticks. It is these changes that are ported over in the commit in hand. One visible change of this commit is that the default value of 'zfs_vdev_time_shift' tunable is changed: zfs_vdev_time_shift = 6 to zfs_vdev_time_shift = 29 The original value of 6 was inherited from OpenSolaris and was subotimal - since it shifted the raw tick value - it didn't compensate for different tick frequencies on Linux and OpenSolaris. The former has HZ=1000, while the latter HZ=100. (Which itself led to other interesting performance anomalies under non-trivial load. The deadline scheduler delays the IO according to its priority - the lower priority the further the deadline is set. The delay is measured in units of "shifted ticks". Since the HZ value was 10 times higher, the delay units were 10 times shorter. Thus really low priority IO like resilver (delay is 10 units) and scrub (delay is 20 units) were scheduled much sooner than intended. The overall effect is that resilver and scrub IO consumed more bandwidth at the expense of the other IO.) Now that the bookkeeping is done is nanoseconds the shift behaves correctly for any tick frequency (HZ). Ported-by: Cyril Plisko <cyril.plisko@mountall.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #1643
2013-03-22 02:47:36 +04:00
zio->io_timestamp = gethrtime();
zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
zio->io_priority;
2008-11-20 23:01:55 +03:00
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
mutex_exit(&vq->vq_lock);
if (nio == NULL)
return (NULL);
if (nio->io_done == vdev_queue_agg_io_done) {
zio_nowait(nio);
return (NULL);
}
return (nio);
}
void
vdev_queue_io_done(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
int i;
2008-11-20 23:01:55 +03:00
if (zio_injection_enabled)
delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
2008-11-20 23:01:55 +03:00
mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);
2008-11-20 23:01:55 +03:00
Illumos #3618 ::zio dcmd does not show timestamp data 3618 ::zio dcmd does not show timestamp data Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: George Wilson <gwilson@zfsmail.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Garrett D'Amore <garrett@damore.org> Approved by: Dan McDonald <danmcd@nexenta.com> References: http://www.illumos.org/issues/3618 illumos/illumos-gate@c55e05cb35da47582b7afd38734d2f0d9c6deb40 Notes on porting to ZFS on Linux: The original changeset mostly deals with mdb ::zio dcmd. However, in order to provide the requested functionality it modifies vdev and zio structures to keep the timing data in nanoseconds instead of ticks. It is these changes that are ported over in the commit in hand. One visible change of this commit is that the default value of 'zfs_vdev_time_shift' tunable is changed: zfs_vdev_time_shift = 6 to zfs_vdev_time_shift = 29 The original value of 6 was inherited from OpenSolaris and was subotimal - since it shifted the raw tick value - it didn't compensate for different tick frequencies on Linux and OpenSolaris. The former has HZ=1000, while the latter HZ=100. (Which itself led to other interesting performance anomalies under non-trivial load. The deadline scheduler delays the IO according to its priority - the lower priority the further the deadline is set. The delay is measured in units of "shifted ticks". Since the HZ value was 10 times higher, the delay units were 10 times shorter. Thus really low priority IO like resilver (delay is 10 units) and scrub (delay is 20 units) were scheduled much sooner than intended. The overall effect is that resilver and scrub IO consumed more bandwidth at the expense of the other IO.) Now that the bookkeeping is done is nanoseconds the shift behaves correctly for any tick frequency (HZ). Ported-by: Cyril Plisko <cyril.plisko@mountall.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #1643
2013-03-22 02:47:36 +04:00
zio->io_delta = gethrtime() - zio->io_timestamp;
vq->vq_io_complete_ts = gethrtime();
vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
for (i = 0; i < zfs_vdev_ramp_rate; i++) {
zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
2008-11-20 23:01:55 +03:00
if (nio == NULL)
break;
mutex_exit(&vq->vq_lock);
if (nio->io_done == vdev_queue_agg_io_done) {
zio_nowait(nio);
} else {
zio_vdev_io_reissue(nio);
zio_execute(nio);
}
mutex_enter(&vq->vq_lock);
}
mutex_exit(&vq->vq_lock);
}
#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_vdev_max_pending, int, 0644);
Add missing ZFS tunables This commit adds module options for all existing zfs tunables. Ideally the average user should never need to modify any of these values. However, in practice sometimes you do need to tweak these values for one reason or another. In those cases it's nice not to have to resort to rebuilding from source. All tunables are visable to modinfo and the list is as follows: $ modinfo module/zfs/zfs.ko filename: module/zfs/zfs.ko license: CDDL author: Sun Microsystems/Oracle, Lawrence Livermore National Laboratory description: ZFS srcversion: 8EAB1D71DACE05B5AA61567 depends: spl,znvpair,zcommon,zunicode,zavl vermagic: 2.6.32-131.0.5.el6.x86_64 SMP mod_unload modversions parm: zvol_major:Major number for zvol device (uint) parm: zvol_threads:Number of threads for zvol device (uint) parm: zio_injection_enabled:Enable fault injection (int) parm: zio_bulk_flags:Additional flags to pass to bulk buffers (int) parm: zio_delay_max:Max zio millisec delay before posting event (int) parm: zio_requeue_io_start_cut_in_line:Prioritize requeued I/O (bool) parm: zil_replay_disable:Disable intent logging replay (int) parm: zfs_nocacheflush:Disable cache flushes (bool) parm: zfs_read_chunk_size:Bytes to read per chunk (long) parm: zfs_vdev_max_pending:Max pending per-vdev I/Os (int) parm: zfs_vdev_min_pending:Min pending per-vdev I/Os (int) parm: zfs_vdev_aggregation_limit:Max vdev I/O aggregation size (int) parm: zfs_vdev_time_shift:Deadline time shift for vdev I/O (int) parm: zfs_vdev_ramp_rate:Exponential I/O issue ramp-up rate (int) parm: zfs_vdev_read_gap_limit:Aggregate read I/O over gap (int) parm: zfs_vdev_write_gap_limit:Aggregate write I/O over gap (int) parm: zfs_vdev_scheduler:I/O scheduler (charp) parm: zfs_vdev_cache_max:Inflate reads small than max (int) parm: zfs_vdev_cache_size:Total size of the per-disk cache (int) parm: zfs_vdev_cache_bshift:Shift size to inflate reads too (int) parm: zfs_scrub_limit:Max scrub/resilver I/O per leaf vdev (int) parm: zfs_recover:Set to attempt to recover from fatal errors (int) parm: spa_config_path:SPA config file (/etc/zfs/zpool.cache) (charp) parm: zfs_zevent_len_max:Max event queue length (int) parm: zfs_zevent_cols:Max event column width (int) parm: zfs_zevent_console:Log events to the console (int) parm: zfs_top_maxinflight:Max I/Os per top-level (int) parm: zfs_resilver_delay:Number of ticks to delay resilver (int) parm: zfs_scrub_delay:Number of ticks to delay scrub (int) parm: zfs_scan_idle:Idle window in clock ticks (int) parm: zfs_scan_min_time_ms:Min millisecs to scrub per txg (int) parm: zfs_free_min_time_ms:Min millisecs to free per txg (int) parm: zfs_resilver_min_time_ms:Min millisecs to resilver per txg (int) parm: zfs_no_scrub_io:Set to disable scrub I/O (bool) parm: zfs_no_scrub_prefetch:Set to disable scrub prefetching (bool) parm: zfs_txg_timeout:Max seconds worth of delta per txg (int) parm: zfs_no_write_throttle:Disable write throttling (int) parm: zfs_write_limit_shift:log2(fraction of memory) per txg (int) parm: zfs_txg_synctime_ms:Target milliseconds between tgx sync (int) parm: zfs_write_limit_min:Min tgx write limit (ulong) parm: zfs_write_limit_max:Max tgx write limit (ulong) parm: zfs_write_limit_inflated:Inflated tgx write limit (ulong) parm: zfs_write_limit_override:Override tgx write limit (ulong) parm: zfs_prefetch_disable:Disable all ZFS prefetching (int) parm: zfetch_max_streams:Max number of streams per zfetch (uint) parm: zfetch_min_sec_reap:Min time before stream reclaim (uint) parm: zfetch_block_cap:Max number of blocks to fetch at a time (uint) parm: zfetch_array_rd_sz:Number of bytes in a array_read (ulong) parm: zfs_pd_blks_max:Max number of blocks to prefetch (int) parm: zfs_dedup_prefetch:Enable prefetching dedup-ed blks (int) parm: zfs_arc_min:Min arc size (ulong) parm: zfs_arc_max:Max arc size (ulong) parm: zfs_arc_meta_limit:Meta limit for arc size (ulong) parm: zfs_arc_reduce_dnlc_percent:Meta reclaim percentage (int) parm: zfs_arc_grow_retry:Seconds before growing arc size (int) parm: zfs_arc_shrink_shift:log2(fraction of arc to reclaim) (int) parm: zfs_arc_p_min_shift:arc_c shift to calc min/max arc_p (int)
2011-05-04 02:09:28 +04:00
MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os");
module_param(zfs_vdev_min_pending, int, 0644);
Add missing ZFS tunables This commit adds module options for all existing zfs tunables. Ideally the average user should never need to modify any of these values. However, in practice sometimes you do need to tweak these values for one reason or another. In those cases it's nice not to have to resort to rebuilding from source. All tunables are visable to modinfo and the list is as follows: $ modinfo module/zfs/zfs.ko filename: module/zfs/zfs.ko license: CDDL author: Sun Microsystems/Oracle, Lawrence Livermore National Laboratory description: ZFS srcversion: 8EAB1D71DACE05B5AA61567 depends: spl,znvpair,zcommon,zunicode,zavl vermagic: 2.6.32-131.0.5.el6.x86_64 SMP mod_unload modversions parm: zvol_major:Major number for zvol device (uint) parm: zvol_threads:Number of threads for zvol device (uint) parm: zio_injection_enabled:Enable fault injection (int) parm: zio_bulk_flags:Additional flags to pass to bulk buffers (int) parm: zio_delay_max:Max zio millisec delay before posting event (int) parm: zio_requeue_io_start_cut_in_line:Prioritize requeued I/O (bool) parm: zil_replay_disable:Disable intent logging replay (int) parm: zfs_nocacheflush:Disable cache flushes (bool) parm: zfs_read_chunk_size:Bytes to read per chunk (long) parm: zfs_vdev_max_pending:Max pending per-vdev I/Os (int) parm: zfs_vdev_min_pending:Min pending per-vdev I/Os (int) parm: zfs_vdev_aggregation_limit:Max vdev I/O aggregation size (int) parm: zfs_vdev_time_shift:Deadline time shift for vdev I/O (int) parm: zfs_vdev_ramp_rate:Exponential I/O issue ramp-up rate (int) parm: zfs_vdev_read_gap_limit:Aggregate read I/O over gap (int) parm: zfs_vdev_write_gap_limit:Aggregate write I/O over gap (int) parm: zfs_vdev_scheduler:I/O scheduler (charp) parm: zfs_vdev_cache_max:Inflate reads small than max (int) parm: zfs_vdev_cache_size:Total size of the per-disk cache (int) parm: zfs_vdev_cache_bshift:Shift size to inflate reads too (int) parm: zfs_scrub_limit:Max scrub/resilver I/O per leaf vdev (int) parm: zfs_recover:Set to attempt to recover from fatal errors (int) parm: spa_config_path:SPA config file (/etc/zfs/zpool.cache) (charp) parm: zfs_zevent_len_max:Max event queue length (int) parm: zfs_zevent_cols:Max event column width (int) parm: zfs_zevent_console:Log events to the console (int) parm: zfs_top_maxinflight:Max I/Os per top-level (int) parm: zfs_resilver_delay:Number of ticks to delay resilver (int) parm: zfs_scrub_delay:Number of ticks to delay scrub (int) parm: zfs_scan_idle:Idle window in clock ticks (int) parm: zfs_scan_min_time_ms:Min millisecs to scrub per txg (int) parm: zfs_free_min_time_ms:Min millisecs to free per txg (int) parm: zfs_resilver_min_time_ms:Min millisecs to resilver per txg (int) parm: zfs_no_scrub_io:Set to disable scrub I/O (bool) parm: zfs_no_scrub_prefetch:Set to disable scrub prefetching (bool) parm: zfs_txg_timeout:Max seconds worth of delta per txg (int) parm: zfs_no_write_throttle:Disable write throttling (int) parm: zfs_write_limit_shift:log2(fraction of memory) per txg (int) parm: zfs_txg_synctime_ms:Target milliseconds between tgx sync (int) parm: zfs_write_limit_min:Min tgx write limit (ulong) parm: zfs_write_limit_max:Max tgx write limit (ulong) parm: zfs_write_limit_inflated:Inflated tgx write limit (ulong) parm: zfs_write_limit_override:Override tgx write limit (ulong) parm: zfs_prefetch_disable:Disable all ZFS prefetching (int) parm: zfetch_max_streams:Max number of streams per zfetch (uint) parm: zfetch_min_sec_reap:Min time before stream reclaim (uint) parm: zfetch_block_cap:Max number of blocks to fetch at a time (uint) parm: zfetch_array_rd_sz:Number of bytes in a array_read (ulong) parm: zfs_pd_blks_max:Max number of blocks to prefetch (int) parm: zfs_dedup_prefetch:Enable prefetching dedup-ed blks (int) parm: zfs_arc_min:Min arc size (ulong) parm: zfs_arc_max:Max arc size (ulong) parm: zfs_arc_meta_limit:Meta limit for arc size (ulong) parm: zfs_arc_reduce_dnlc_percent:Meta reclaim percentage (int) parm: zfs_arc_grow_retry:Seconds before growing arc size (int) parm: zfs_arc_shrink_shift:log2(fraction of arc to reclaim) (int) parm: zfs_arc_p_min_shift:arc_c shift to calc min/max arc_p (int)
2011-05-04 02:09:28 +04:00
MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os");
module_param(zfs_vdev_aggregation_limit, int, 0644);
Add missing ZFS tunables This commit adds module options for all existing zfs tunables. Ideally the average user should never need to modify any of these values. However, in practice sometimes you do need to tweak these values for one reason or another. In those cases it's nice not to have to resort to rebuilding from source. All tunables are visable to modinfo and the list is as follows: $ modinfo module/zfs/zfs.ko filename: module/zfs/zfs.ko license: CDDL author: Sun Microsystems/Oracle, Lawrence Livermore National Laboratory description: ZFS srcversion: 8EAB1D71DACE05B5AA61567 depends: spl,znvpair,zcommon,zunicode,zavl vermagic: 2.6.32-131.0.5.el6.x86_64 SMP mod_unload modversions parm: zvol_major:Major number for zvol device (uint) parm: zvol_threads:Number of threads for zvol device (uint) parm: zio_injection_enabled:Enable fault injection (int) parm: zio_bulk_flags:Additional flags to pass to bulk buffers (int) parm: zio_delay_max:Max zio millisec delay before posting event (int) parm: zio_requeue_io_start_cut_in_line:Prioritize requeued I/O (bool) parm: zil_replay_disable:Disable intent logging replay (int) parm: zfs_nocacheflush:Disable cache flushes (bool) parm: zfs_read_chunk_size:Bytes to read per chunk (long) parm: zfs_vdev_max_pending:Max pending per-vdev I/Os (int) parm: zfs_vdev_min_pending:Min pending per-vdev I/Os (int) parm: zfs_vdev_aggregation_limit:Max vdev I/O aggregation size (int) parm: zfs_vdev_time_shift:Deadline time shift for vdev I/O (int) parm: zfs_vdev_ramp_rate:Exponential I/O issue ramp-up rate (int) parm: zfs_vdev_read_gap_limit:Aggregate read I/O over gap (int) parm: zfs_vdev_write_gap_limit:Aggregate write I/O over gap (int) parm: zfs_vdev_scheduler:I/O scheduler (charp) parm: zfs_vdev_cache_max:Inflate reads small than max (int) parm: zfs_vdev_cache_size:Total size of the per-disk cache (int) parm: zfs_vdev_cache_bshift:Shift size to inflate reads too (int) parm: zfs_scrub_limit:Max scrub/resilver I/O per leaf vdev (int) parm: zfs_recover:Set to attempt to recover from fatal errors (int) parm: spa_config_path:SPA config file (/etc/zfs/zpool.cache) (charp) parm: zfs_zevent_len_max:Max event queue length (int) parm: zfs_zevent_cols:Max event column width (int) parm: zfs_zevent_console:Log events to the console (int) parm: zfs_top_maxinflight:Max I/Os per top-level (int) parm: zfs_resilver_delay:Number of ticks to delay resilver (int) parm: zfs_scrub_delay:Number of ticks to delay scrub (int) parm: zfs_scan_idle:Idle window in clock ticks (int) parm: zfs_scan_min_time_ms:Min millisecs to scrub per txg (int) parm: zfs_free_min_time_ms:Min millisecs to free per txg (int) parm: zfs_resilver_min_time_ms:Min millisecs to resilver per txg (int) parm: zfs_no_scrub_io:Set to disable scrub I/O (bool) parm: zfs_no_scrub_prefetch:Set to disable scrub prefetching (bool) parm: zfs_txg_timeout:Max seconds worth of delta per txg (int) parm: zfs_no_write_throttle:Disable write throttling (int) parm: zfs_write_limit_shift:log2(fraction of memory) per txg (int) parm: zfs_txg_synctime_ms:Target milliseconds between tgx sync (int) parm: zfs_write_limit_min:Min tgx write limit (ulong) parm: zfs_write_limit_max:Max tgx write limit (ulong) parm: zfs_write_limit_inflated:Inflated tgx write limit (ulong) parm: zfs_write_limit_override:Override tgx write limit (ulong) parm: zfs_prefetch_disable:Disable all ZFS prefetching (int) parm: zfetch_max_streams:Max number of streams per zfetch (uint) parm: zfetch_min_sec_reap:Min time before stream reclaim (uint) parm: zfetch_block_cap:Max number of blocks to fetch at a time (uint) parm: zfetch_array_rd_sz:Number of bytes in a array_read (ulong) parm: zfs_pd_blks_max:Max number of blocks to prefetch (int) parm: zfs_dedup_prefetch:Enable prefetching dedup-ed blks (int) parm: zfs_arc_min:Min arc size (ulong) parm: zfs_arc_max:Max arc size (ulong) parm: zfs_arc_meta_limit:Meta limit for arc size (ulong) parm: zfs_arc_reduce_dnlc_percent:Meta reclaim percentage (int) parm: zfs_arc_grow_retry:Seconds before growing arc size (int) parm: zfs_arc_shrink_shift:log2(fraction of arc to reclaim) (int) parm: zfs_arc_p_min_shift:arc_c shift to calc min/max arc_p (int)
2011-05-04 02:09:28 +04:00
MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");
module_param(zfs_vdev_time_shift, int, 0644);
MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O");
module_param(zfs_vdev_ramp_rate, int, 0644);
MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate");
module_param(zfs_vdev_read_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");
module_param(zfs_vdev_write_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap");
#endif