mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 10:37:35 +03:00
OpenZFS 6531 - Provide mechanism to artificially limit disk performance
Reviewed by: Paul Dagnelie <pcd@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Approved by: Dan McDonald <danmcd@omniti.com> Ported by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> OpenZFS-issue: https://www.illumos.org/issues/6531 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/97e8130 Porting notes: - Added new IO delay tracepoints, and moved common ZIO tracepoint macros to a new trace_common.h file. - Used zio_delay_taskq() in place of OpenZFS's timeout_generic() function. - Updated zinject man page - Updated zpool_scrub test files
This commit is contained in:
committed by
Brian Behlendorf
parent
7e945072d1
commit
26ef0cc7db
@@ -47,4 +47,5 @@
|
||||
#include <sys/trace_multilist.h>
|
||||
#include <sys/trace_txg.h>
|
||||
#include <sys/trace_zil.h>
|
||||
#include <sys/trace_zio.h>
|
||||
#include <sys/trace_zrlock.h>
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
|
||||
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
|
||||
* LLNL-CODE-403049.
|
||||
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@@ -414,7 +414,7 @@ vdev_disk_dio_put(dio_request_t *dr)
|
||||
ASSERT3S(zio->io_error, >=, 0);
|
||||
if (zio->io_error)
|
||||
vdev_disk_error(zio);
|
||||
zio_interrupt(zio);
|
||||
zio_delay_interrupt(zio);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -726,6 +726,7 @@ vdev_disk_io_start(zio_t *zio)
|
||||
return;
|
||||
}
|
||||
|
||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||
error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
|
||||
zio->io_size, zio->io_offset, flags, 0);
|
||||
if (error) {
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@@ -159,7 +159,7 @@ vdev_file_io_strategy(void *arg)
|
||||
if (resid != 0 && zio->io_error == 0)
|
||||
zio->io_error = SET_ERROR(ENOSPC);
|
||||
|
||||
zio_interrupt(zio);
|
||||
zio_delay_interrupt(zio);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -217,6 +217,8 @@ vdev_file_io_start(zio_t *zio)
|
||||
return;
|
||||
}
|
||||
|
||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||
|
||||
VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio,
|
||||
TQ_SLEEP), !=, 0);
|
||||
}
|
||||
|
||||
@@ -753,9 +753,6 @@ vdev_queue_io_done(zio_t *zio)
|
||||
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
|
||||
zio_t *nio;
|
||||
|
||||
if (zio_injection_enabled)
|
||||
delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
|
||||
|
||||
mutex_enter(&vq->vq_lock);
|
||||
|
||||
vdev_queue_pending_remove(vq, zio);
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
#include <sys/blkptr.h>
|
||||
#include <sys/zfeature.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/trace_zio.h>
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
@@ -1390,6 +1391,76 @@ zio_interrupt(zio_t *zio)
|
||||
zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
|
||||
}
|
||||
|
||||
void
|
||||
zio_delay_interrupt(zio_t *zio)
|
||||
{
|
||||
/*
|
||||
* The timeout_generic() function isn't defined in userspace, so
|
||||
* rather than trying to implement the function, the zio delay
|
||||
* functionality has been disabled for userspace builds.
|
||||
*/
|
||||
|
||||
#ifdef _KERNEL
|
||||
/*
|
||||
* If io_target_timestamp is zero, then no delay has been registered
|
||||
* for this IO, thus jump to the end of this function and "skip" the
|
||||
* delay; issuing it directly to the zio layer.
|
||||
*/
|
||||
if (zio->io_target_timestamp != 0) {
|
||||
hrtime_t now = gethrtime();
|
||||
|
||||
if (now >= zio->io_target_timestamp) {
|
||||
/*
|
||||
* This IO has already taken longer than the target
|
||||
* delay to complete, so we don't want to delay it
|
||||
* any longer; we "miss" the delay and issue it
|
||||
* directly to the zio layer. This is likely due to
|
||||
* the target latency being set to a value less than
|
||||
* the underlying hardware can satisfy (e.g. delay
|
||||
* set to 1ms, but the disks take 10ms to complete an
|
||||
* IO request).
|
||||
*/
|
||||
|
||||
DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
|
||||
hrtime_t, now);
|
||||
|
||||
zio_interrupt(zio);
|
||||
} else {
|
||||
taskqid_t tid;
|
||||
hrtime_t diff = zio->io_target_timestamp - now;
|
||||
clock_t expire_at_tick = ddi_get_lbolt() +
|
||||
NSEC_TO_TICK(diff);
|
||||
|
||||
DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
|
||||
hrtime_t, now, hrtime_t, diff);
|
||||
|
||||
if (NSEC_TO_TICK(diff) == 0) {
|
||||
/* Our delay is less than a jiffy - just spin */
|
||||
zfs_sleep_until(zio->io_target_timestamp);
|
||||
} else {
|
||||
/*
|
||||
* Use taskq_dispatch_delay() in the place of
|
||||
* OpenZFS's timeout_generic().
|
||||
*/
|
||||
tid = taskq_dispatch_delay(system_taskq,
|
||||
(task_func_t *) zio_interrupt,
|
||||
zio, TQ_NOSLEEP, expire_at_tick);
|
||||
if (!tid) {
|
||||
/*
|
||||
* Couldn't allocate a task. Just
|
||||
* finish the zio without a delay.
|
||||
*/
|
||||
zio_interrupt(zio);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
|
||||
zio_interrupt(zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Execute the I/O pipeline until one of the following occurs:
|
||||
* (1) the I/O completes; (2) the pipeline stalls waiting for
|
||||
|
||||
+246
-15
@@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
/*
|
||||
@@ -49,15 +49,53 @@
|
||||
|
||||
uint32_t zio_injection_enabled = 0;
|
||||
|
||||
/*
|
||||
* Data describing each zinject handler registered on the system, and
|
||||
* contains the list node linking the handler in the global zinject
|
||||
* handler list.
|
||||
*/
|
||||
/*
 * One registered zinject handler.  Instances are linked on the global
 * inject_handlers list through zi_link.
 */
typedef struct inject_handler {
|
||||
/* Unique identifier, taken from inject_next_id at registration time. */
int zi_id;
|
||||
/* Pool the handler was registered against; released via spa_inject_delref(). */
spa_t *zi_spa;
|
||||
/* Copy of the zinject record supplied when the handler was registered. */
zinject_record_t zi_record;
|
||||
/*
 * For ZINJECT_DELAY_IO handlers: array of zi_record.zi_nlanes entries,
 * each holding the time at which that lane becomes idle.  NULL for all
 * other handler types.
 */
uint64_t *zi_lanes;
|
||||
/* Index into zi_lanes of the lane the next delayed IO will claim. */
int zi_next_lane;
|
||||
/* Linkage on the inject_handlers list. */
list_node_t zi_link;
|
||||
} inject_handler_t;
|
||||
|
||||
/*
|
||||
* List of all zinject handlers registered on the system, protected by
|
||||
* the inject_lock defined below.
|
||||
*/
|
||||
static list_t inject_handlers;
|
||||
|
||||
/*
|
||||
* This protects insertion into, and traversal of, the inject handler
|
||||
* list defined above; as well as the inject_delay_count. Any time a
|
||||
* handler is inserted or removed from the list, this lock should be
|
||||
* taken as a RW_WRITER; and any time traversal is done over the list
|
||||
* (without modification to it) this lock should be taken as a RW_READER.
|
||||
*/
|
||||
static krwlock_t inject_lock;
|
||||
|
||||
/*
|
||||
* This holds the number of zinject delay handlers that have been
|
||||
* registered on the system. It is protected by the inject_lock defined
|
||||
* above. Thus modifications to this count must be a RW_WRITER of the
|
||||
* inject_lock, and reads of this count must be (at least) a RW_READER
|
||||
* of the lock.
|
||||
*/
|
||||
static int inject_delay_count = 0;
|
||||
|
||||
/*
|
||||
* This lock is used only in zio_handle_io_delay(), refer to the comment
|
||||
* in that function for more details.
|
||||
*/
|
||||
static kmutex_t inject_delay_mtx;
|
||||
|
||||
/*
|
||||
* Used to assign unique identifying numbers to each new zinject handler.
|
||||
*/
|
||||
static int inject_next_id = 1;
|
||||
|
||||
/*
|
||||
@@ -361,21 +399,70 @@ spa_handle_ignored_writes(spa_t *spa)
|
||||
rw_exit(&inject_lock);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
hrtime_t
|
||||
zio_handle_io_delay(zio_t *zio)
|
||||
{
|
||||
vdev_t *vd = zio->io_vd;
|
||||
inject_handler_t *min_handler = NULL;
|
||||
hrtime_t min_target = 0;
|
||||
inject_handler_t *handler;
|
||||
uint64_t seconds = 0;
|
||||
|
||||
if (zio_injection_enabled == 0)
|
||||
return (0);
|
||||
hrtime_t idle;
|
||||
hrtime_t busy;
|
||||
hrtime_t target;
|
||||
|
||||
rw_enter(&inject_lock, RW_READER);
|
||||
|
||||
for (handler = list_head(&inject_handlers); handler != NULL;
|
||||
handler = list_next(&inject_handlers, handler)) {
|
||||
/*
|
||||
* inject_delay_count is a subset of zio_injection_enabled that
|
||||
* is only incremented for delay handlers. These checks are
|
||||
* mainly added to remind the reader why we're not explicitly
|
||||
* checking zio_injection_enabled like the other functions.
|
||||
*/
|
||||
IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
|
||||
IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
|
||||
|
||||
/*
|
||||
* If there aren't any inject delay handlers registered, then we
|
||||
* can short circuit and simply return 0 here. A value of zero
|
||||
* informs zio_delay_interrupt() that this request should not be
|
||||
* delayed. This short circuit keeps us from acquiring the
|
||||
* inject_delay_mtx unnecessarily.
|
||||
*/
|
||||
if (inject_delay_count == 0) {
|
||||
rw_exit(&inject_lock);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Each inject handler has a number of "lanes" associated with
|
||||
* it. Each lane is able to handle requests independently of one
|
||||
* another, and at a latency defined by the inject handler
|
||||
* record's zi_timer field. Thus if a handler is configured with
|
||||
* a single lane with a 10ms latency, it will delay requests
|
||||
* such that only a single request is completed every 10ms. So,
|
||||
* if more than one request is attempted per each 10ms interval,
|
||||
* the average latency of the requests will be greater than
|
||||
* 10ms; but if only a single request is submitted each 10ms
|
||||
* interval the average latency will be 10ms.
|
||||
*
|
||||
* We need to acquire this mutex to prevent multiple concurrent
|
||||
* threads being assigned to the same lane of a given inject
|
||||
* handler. The mutex allows us to perform the following two
|
||||
* operations atomically:
|
||||
*
|
||||
* 1. determine the minimum handler and minimum target
|
||||
* value of all the possible handlers
|
||||
* 2. update that minimum handler's lane array
|
||||
*
|
||||
* Without atomicity, two (or more) threads could pick the same
|
||||
* lane in step (1), and then conflict with each other in step
|
||||
* (2). This could allow a single lane handler to process
|
||||
* multiple requests simultaneously, which shouldn't be possible.
|
||||
*/
|
||||
mutex_enter(&inject_delay_mtx);
|
||||
|
||||
for (handler = list_head(&inject_handlers);
|
||||
handler != NULL; handler = list_next(&inject_handlers, handler)) {
|
||||
if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
|
||||
continue;
|
||||
|
||||
@@ -384,14 +471,101 @@ zio_handle_io_delay(zio_t *zio)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (vd->vdev_guid == handler->zi_record.zi_guid) {
|
||||
seconds = handler->zi_record.zi_timer;
|
||||
break;
|
||||
if (vd->vdev_guid != handler->zi_record.zi_guid)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Defensive; should never happen as the array allocation
|
||||
* occurs prior to inserting this handler on the list.
|
||||
*/
|
||||
ASSERT3P(handler->zi_lanes, !=, NULL);
|
||||
|
||||
/*
|
||||
* This should never happen, the zinject command should
|
||||
* prevent a user from setting an IO delay with zero lanes.
|
||||
*/
|
||||
ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
|
||||
|
||||
ASSERT3U(handler->zi_record.zi_nlanes, >,
|
||||
handler->zi_next_lane);
|
||||
|
||||
/*
|
||||
* We want to issue this IO to the lane that will become
|
||||
* idle the soonest, so we compare the soonest this
|
||||
* specific handler can complete the IO with all other
|
||||
* handlers, to find the lowest value of all possible
|
||||
* lanes. We then use this lane to submit the request.
|
||||
*
|
||||
* Since each handler has a constant value for its
|
||||
* delay, we can just use the "next" lane for that
|
||||
* handler; as it will always be the lane with the
|
||||
* lowest value for that particular handler (i.e. the
|
||||
* lane that will become idle the soonest). This saves a
|
||||
* scan of each handler's lanes array.
|
||||
*
|
||||
* There are two cases to consider when determining when
|
||||
* this specific IO request should complete. If this
|
||||
* lane is idle, we want to "submit" the request now so
|
||||
* it will complete after zi_timer milliseconds. Thus,
|
||||
* we set the target to now + zi_timer.
|
||||
*
|
||||
* If the lane is busy, we want this request to complete
|
||||
* zi_timer milliseconds after the lane becomes idle.
|
||||
* Since the 'zi_lanes' array holds the time at which
|
||||
* each lane will become idle, we use that value to
|
||||
* determine when this request should complete.
|
||||
*/
|
||||
idle = handler->zi_record.zi_timer + gethrtime();
|
||||
busy = handler->zi_record.zi_timer +
|
||||
handler->zi_lanes[handler->zi_next_lane];
|
||||
target = MAX(idle, busy);
|
||||
|
||||
if (min_handler == NULL) {
|
||||
min_handler = handler;
|
||||
min_target = target;
|
||||
continue;
|
||||
}
|
||||
|
||||
ASSERT3P(min_handler, !=, NULL);
|
||||
ASSERT3U(min_target, !=, 0);
|
||||
|
||||
/*
|
||||
* We don't yet increment the "next lane" variable since
|
||||
* we still might find a lower value lane in another
|
||||
* handler during any remaining iterations. Once we're
|
||||
* sure we've selected the absolute minimum, we'll claim
|
||||
* the lane and increment the handler's "next lane"
|
||||
* field below.
|
||||
*/
|
||||
|
||||
if (target < min_target) {
|
||||
min_handler = handler;
|
||||
min_target = target;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* 'min_handler' will be NULL if no IO delays are registered for
|
||||
* this vdev, otherwise it will point to the handler containing
|
||||
* the lane that will become idle the soonest.
|
||||
*/
|
||||
if (min_handler != NULL) {
|
||||
ASSERT3U(min_target, !=, 0);
|
||||
min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
|
||||
|
||||
/*
|
||||
* If we've used all possible lanes for this handler,
|
||||
* loop back and start using the first lane again;
|
||||
* otherwise, just increment the lane index.
|
||||
*/
|
||||
min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
|
||||
min_handler->zi_record.zi_nlanes;
|
||||
}
|
||||
|
||||
mutex_exit(&inject_delay_mtx);
|
||||
rw_exit(&inject_lock);
|
||||
return (seconds);
|
||||
|
||||
return (min_target);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -415,6 +589,24 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
|
||||
if ((error = spa_reset(name)) != 0)
|
||||
return (error);
|
||||
|
||||
if (record->zi_cmd == ZINJECT_DELAY_IO) {
|
||||
/*
|
||||
* A value of zero for the number of lanes or for the
|
||||
* delay time doesn't make sense.
|
||||
*/
|
||||
if (record->zi_timer == 0 || record->zi_nlanes == 0)
|
||||
return (SET_ERROR(EINVAL));
|
||||
|
||||
/*
|
||||
* The number of lanes is directly mapped to the size of
|
||||
* an array used by the handler. Thus, to ensure the
|
||||
* user doesn't trigger an allocation that's "too large"
|
||||
* we cap the number of lanes here.
|
||||
*/
|
||||
if (record->zi_nlanes >= UINT16_MAX)
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
|
||||
if (!(flags & ZINJECT_NULL)) {
|
||||
/*
|
||||
* spa_inject_ref() will add an injection reference, which will
|
||||
@@ -426,11 +618,34 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
|
||||
|
||||
handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
|
||||
|
||||
rw_enter(&inject_lock, RW_WRITER);
|
||||
|
||||
*id = handler->zi_id = inject_next_id++;
|
||||
handler->zi_spa = spa;
|
||||
handler->zi_record = *record;
|
||||
|
||||
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
|
||||
handler->zi_lanes = kmem_zalloc(
|
||||
sizeof (*handler->zi_lanes) *
|
||||
handler->zi_record.zi_nlanes, KM_SLEEP);
|
||||
handler->zi_next_lane = 0;
|
||||
} else {
|
||||
handler->zi_lanes = NULL;
|
||||
handler->zi_next_lane = 0;
|
||||
}
|
||||
|
||||
rw_enter(&inject_lock, RW_WRITER);
|
||||
|
||||
/*
|
||||
* We can't move this increment into the conditional
|
||||
* above because we need to hold the RW_WRITER lock of
|
||||
* inject_lock, and we don't want to hold that while
|
||||
* allocating the handler's zi_lanes array.
|
||||
*/
|
||||
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
|
||||
ASSERT3S(inject_delay_count, >=, 0);
|
||||
inject_delay_count++;
|
||||
ASSERT3S(inject_delay_count, >, 0);
|
||||
}
|
||||
|
||||
*id = handler->zi_id = inject_next_id++;
|
||||
list_insert_tail(&inject_handlers, handler);
|
||||
atomic_inc_32(&zio_injection_enabled);
|
||||
|
||||
@@ -508,9 +723,23 @@ zio_clear_fault(int id)
|
||||
return (SET_ERROR(ENOENT));
|
||||
}
|
||||
|
||||
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
|
||||
ASSERT3S(inject_delay_count, >, 0);
|
||||
inject_delay_count--;
|
||||
ASSERT3S(inject_delay_count, >=, 0);
|
||||
}
|
||||
|
||||
list_remove(&inject_handlers, handler);
|
||||
rw_exit(&inject_lock);
|
||||
|
||||
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
|
||||
ASSERT3P(handler->zi_lanes, !=, NULL);
|
||||
kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
|
||||
handler->zi_record.zi_nlanes);
|
||||
} else {
|
||||
ASSERT3P(handler->zi_lanes, ==, NULL);
|
||||
}
|
||||
|
||||
spa_inject_delref(handler->zi_spa);
|
||||
kmem_free(handler, sizeof (inject_handler_t));
|
||||
atomic_dec_32(&zio_injection_enabled);
|
||||
@@ -522,6 +751,7 @@ void
|
||||
zio_inject_init(void)
|
||||
{
|
||||
rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
|
||||
mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
|
||||
list_create(&inject_handlers, sizeof (inject_handler_t),
|
||||
offsetof(inject_handler_t, zi_link));
|
||||
}
|
||||
@@ -530,6 +760,7 @@ void
|
||||
zio_inject_fini(void)
|
||||
{
|
||||
list_destroy(&inject_handlers);
|
||||
mutex_destroy(&inject_delay_mtx);
|
||||
rw_destroy(&inject_lock);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user