2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2014-04-16 07:40:22 +04:00
|
|
|
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
2011-11-08 04:26:52 +04:00
|
|
|
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
#include <sys/fm/fs/zfs.h>
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/txg.h>
|
|
|
|
#include <sys/spa_impl.h>
|
|
|
|
#include <sys/vdev_impl.h>
|
|
|
|
#include <sys/zio_impl.h>
|
|
|
|
#include <sys/zio_compress.h>
|
|
|
|
#include <sys/zio_checksum.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/dmu_objset.h>
|
|
|
|
#include <sys/arc.h>
|
|
|
|
#include <sys/ddt.h>
|
2014-06-06 01:19:08 +04:00
|
|
|
#include <sys/blkptr.h>
|
2013-12-09 22:37:51 +04:00
|
|
|
#include <sys/zfeature.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O type descriptions
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
/*
 * Short name for each I/O type; the table has ZIO_TYPES slots.
 * NOTE(review): presumably indexed by the ZIO_TYPE_* value -- confirm that
 * the order below matches the enum declaration.
 */
const char *zio_type_name[ZIO_TYPES] = {
	"z_null",
	"z_rd",
	"z_wr",
	"z_fr",
	"z_cl",
	"z_ioctl"
};
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O kmem caches
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
kmem_cache_t *zio_cache;
|
2009-02-18 23:51:31 +03:00
|
|
|
kmem_cache_t *zio_link_cache;
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
|
|
|
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
2010-10-02 03:54:52 +04:00
|
|
|
int zio_delay_max = ZIO_DELAY_MAX;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-05-06 21:14:52 +04:00
|
|
|
/*
|
|
|
|
* The following actions directly affect the spa's sync-to-convergence logic.
|
|
|
|
* The values below define the sync pass when we start performing the action.
|
|
|
|
* Care should be taken when changing these values as they directly impact
|
|
|
|
* spa_sync() performance. Tuning these values may introduce subtle performance
|
|
|
|
* pathologies and should only be done in the context of performance analysis.
|
|
|
|
* These tunables will eventually be removed and replaced with #defines once
|
|
|
|
* enough analysis has been done to determine optimal values.
|
|
|
|
*
|
|
|
|
* The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
|
|
|
|
* regular blocks are not deferred.
|
|
|
|
*/
|
|
|
|
/* defer frees starting in this pass */
int zfs_sync_pass_deferred_free = 2;

/* don't compress starting in this pass */
int zfs_sync_pass_dont_compress = 5;

/* rewrite new bps starting in this pass */
int zfs_sync_pass_rewrite = 2;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * A zio is "allocating" when the DVA allocate stage is part of its original
 * pipeline: the stage is either set right now or will be reached later in
 * the zio's lifetime.  Evaluates to non-zero for an allocating zio.
 */
#define	IO_IS_ALLOCATING(zio)	\
	((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
|
|
|
|
|
2011-05-04 02:09:28 +04:00
|
|
|
/*
 * Tunable flag, enabled (1) by default.  It is not referenced in this
 * section of the file.
 * NOTE(review): the name suggests it lets a requeued I/O be dispatched
 * ahead of already-queued work -- confirm against its users.
 */
int zio_requeue_io_start_cut_in_line = 1;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
 * Size threshold for kmem debugging of the zio buffer caches: zio_init()
 * passes KMC_NODEBUG for every buffer cache whose size is greater than this
 * value.  Non-debug builds use 0, so every buffer cache gets KMC_NODEBUG.
 */
#ifndef ZFS_DEBUG
int zio_buf_debug_limit = 0;
#else
int zio_buf_debug_limit = 16384;
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
/* Forward declaration; the definition is outside this section of the file. */
static inline void __zio_execute(zio_t *zio);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
void
|
|
|
|
zio_init(void)
|
|
|
|
{
|
|
|
|
size_t c;
|
|
|
|
vmem_t *data_alloc_arena = NULL;
|
|
|
|
|
2015-01-30 22:25:19 +03:00
|
|
|
zio_cache = kmem_cache_create("zio_cache",
|
|
|
|
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_link_cache = kmem_cache_create("zio_link_cache",
|
2014-05-15 05:17:39 +04:00
|
|
|
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For small buffers, we want a cache for each multiple of
|
|
|
|
* SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
|
|
|
|
* for each quarter-power of 2. For large buffers, we want
|
|
|
|
* a cache for each multiple of PAGESIZE.
|
|
|
|
*/
|
|
|
|
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
|
|
|
|
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
|
|
|
|
size_t p2 = size;
|
|
|
|
size_t align = 0;
|
2015-02-07 00:37:02 +03:00
|
|
|
size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
while (p2 & (p2 - 1))
|
|
|
|
p2 &= p2 - 1;
|
|
|
|
|
2013-05-17 01:18:06 +04:00
|
|
|
#ifndef _KERNEL
|
|
|
|
/*
|
|
|
|
* If we are using watchpoints, put each buffer on its own page,
|
|
|
|
* to eliminate the performance overhead of trapping to the
|
|
|
|
* kernel when modifying a non-watched buffer that shares the
|
|
|
|
* page with a watched buffer.
|
|
|
|
*/
|
|
|
|
if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
|
|
|
|
continue;
|
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
if (size <= 4 * SPA_MINBLOCKSIZE) {
|
|
|
|
align = SPA_MINBLOCKSIZE;
|
2013-05-17 01:18:06 +04:00
|
|
|
} else if (IS_P2ALIGNED(size, PAGESIZE)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
align = PAGESIZE;
|
2013-05-17 01:18:06 +04:00
|
|
|
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
align = p2 >> 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (align != 0) {
|
|
|
|
char name[36];
|
|
|
|
(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
|
|
|
|
zio_buf_cache[c] = kmem_cache_create(name, size,
|
2015-02-07 00:37:02 +03:00
|
|
|
align, NULL, NULL, NULL, NULL, NULL, cflags);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
|
|
|
|
zio_data_buf_cache[c] = kmem_cache_create(name, size,
|
2011-11-02 03:56:48 +04:00
|
|
|
align, NULL, NULL, NULL, NULL,
|
2015-02-07 00:37:02 +03:00
|
|
|
data_alloc_arena, cflags);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
while (--c != 0) {
|
|
|
|
ASSERT(zio_buf_cache[c] != NULL);
|
|
|
|
if (zio_buf_cache[c - 1] == NULL)
|
|
|
|
zio_buf_cache[c - 1] = zio_buf_cache[c];
|
|
|
|
|
|
|
|
ASSERT(zio_data_buf_cache[c] != NULL);
|
|
|
|
if (zio_data_buf_cache[c - 1] == NULL)
|
|
|
|
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_inject_init();
|
2013-01-23 13:54:30 +04:00
|
|
|
|
|
|
|
lz4_init();
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_fini(void)
|
|
|
|
{
|
|
|
|
size_t c;
|
|
|
|
kmem_cache_t *last_cache = NULL;
|
|
|
|
kmem_cache_t *last_data_cache = NULL;
|
|
|
|
|
|
|
|
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
|
|
|
|
if (zio_buf_cache[c] != last_cache) {
|
|
|
|
last_cache = zio_buf_cache[c];
|
|
|
|
kmem_cache_destroy(zio_buf_cache[c]);
|
|
|
|
}
|
|
|
|
zio_buf_cache[c] = NULL;
|
|
|
|
|
|
|
|
if (zio_data_buf_cache[c] != last_data_cache) {
|
|
|
|
last_data_cache = zio_data_buf_cache[c];
|
|
|
|
kmem_cache_destroy(zio_data_buf_cache[c]);
|
|
|
|
}
|
|
|
|
zio_data_buf_cache[c] = NULL;
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
kmem_cache_destroy(zio_link_cache);
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_cache_destroy(zio_cache);
|
|
|
|
|
|
|
|
zio_inject_fini();
|
2013-01-23 13:54:30 +04:00
|
|
|
|
|
|
|
lz4_fini();
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Allocate and free I/O buffers
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
|
|
|
|
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
|
|
|
|
* useful to inspect ZFS metadata, but if possible, we should avoid keeping
|
|
|
|
* excess / transient data in-core during a crashdump.
|
|
|
|
*/
|
|
|
|
void *
|
|
|
|
zio_buf_alloc(size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-12-03 22:56:32 +03:00
|
|
|
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use zio_data_buf_alloc to allocate data. The data will not appear in a
|
|
|
|
* crashdump if the kernel panics. This exists so that we will limit the amount
|
|
|
|
* of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
|
|
|
|
* of kernel heap dumped to disk when the kernel panics)
|
|
|
|
*/
|
|
|
|
void *
|
|
|
|
zio_data_buf_alloc(size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
|
|
|
ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
|
|
|
|
2014-12-03 22:56:32 +03:00
|
|
|
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_buf_free(void *buf, size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
|
|
|
ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
|
|
|
|
|
|
|
kmem_cache_free(zio_buf_cache[c], buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_data_buf_free(void *buf, size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
|
|
|
ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
|
|
|
|
|
|
|
kmem_cache_free(zio_data_buf_cache[c], buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Push and pop I/O transform buffers
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
|
|
|
|
zio_transform_func_t *transform)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2014-11-21 03:09:39 +03:00
|
|
|
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zt->zt_orig_data = zio->io_data;
|
|
|
|
zt->zt_orig_size = zio->io_size;
|
2008-11-20 23:01:55 +03:00
|
|
|
zt->zt_bufsize = bufsize;
|
2008-12-03 23:09:06 +03:00
|
|
|
zt->zt_transform = transform;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zt->zt_next = zio->io_transform_stack;
|
|
|
|
zio->io_transform_stack = zt;
|
|
|
|
|
|
|
|
zio->io_data = data;
|
|
|
|
zio->io_size = size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_pop_transforms(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_transform_t *zt;
|
|
|
|
|
|
|
|
while ((zt = zio->io_transform_stack) != NULL) {
|
|
|
|
if (zt->zt_transform != NULL)
|
|
|
|
zt->zt_transform(zio,
|
|
|
|
zt->zt_orig_data, zt->zt_orig_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zt->zt_bufsize != 0)
|
|
|
|
zio_buf_free(zio->io_data, zt->zt_bufsize);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_data = zt->zt_orig_data;
|
|
|
|
zio->io_size = zt->zt_orig_size;
|
|
|
|
zio->io_transform_stack = zt->zt_next;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
kmem_free(zt, sizeof (zio_transform_t));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O transform callbacks for subblocks and decompression
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zio_subblock(zio_t *zio, void *data, uint64_t size)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_size > size);
|
|
|
|
|
|
|
|
if (zio->io_type == ZIO_TYPE_READ)
|
|
|
|
bcopy(zio->io_data, data, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zio_decompress(zio_t *zio, void *data, uint64_t size)
|
|
|
|
{
|
|
|
|
if (zio->io_error == 0 &&
|
|
|
|
zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_data, data, zio->io_size, size) != 0)
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(EIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O parent/child relationships and pipeline interlocks
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2009-02-18 23:51:31 +03:00
|
|
|
/*
|
|
|
|
* NOTE - Callers to zio_walk_parents() and zio_walk_children() must
|
|
|
|
* continue calling these functions until they return NULL.
|
|
|
|
* Otherwise, the next caller will pick up the list walk in
|
|
|
|
* some indeterminate state. (Otherwise every caller would
|
|
|
|
* have to pass in a cookie to keep the state represented by
|
|
|
|
* io_walk_link, which gets annoying.)
|
|
|
|
*/
|
|
|
|
/*
 * Return the next parent of 'cio', or NULL once the parent list has been
 * exhausted.  The cursor is kept in cio->io_walk_link: a NULL cursor starts
 * at the head of the list, and reaching the end leaves it NULL again.
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	list_t *pl = &cio->io_parent_list;
	zio_link_t *zl = cio->io_walk_link;

	if (zl == NULL)
		zl = list_head(pl);
	else
		zl = list_next(pl, zl);

	cio->io_walk_link = zl;

	if (zl != NULL) {
		ASSERT(zl->zl_child == cio);
		return (zl->zl_parent);
	}

	return (NULL);
}
|
|
|
|
|
|
|
|
/*
 * Return the next child of 'pio', or NULL once the child list has been
 * exhausted.  The cursor is kept in pio->io_walk_link: a NULL cursor starts
 * at the head of the list, and reaching the end leaves it NULL again.
 */
zio_t *
zio_walk_children(zio_t *pio)
{
	list_t *cl = &pio->io_child_list;
	zio_link_t *zl = pio->io_walk_link;

	if (zl == NULL)
		zl = list_head(cl);
	else
		zl = list_next(cl, zl);

	pio->io_walk_link = zl;

	if (zl != NULL) {
		ASSERT(zl->zl_parent == pio);
		return (zl->zl_child);
	}

	return (NULL);
}
|
|
|
|
|
|
|
|
/*
 * Return the only parent of 'cio' (NULL if it has none).  A second walk
 * step is verified to return NULL, which both enforces that there is at
 * most one parent and leaves the walk cursor reset.
 */
zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio;

	pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);

	return (pio);
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_add_child(zio_t *pio, zio_t *cio)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2014-11-21 03:09:39 +03:00
|
|
|
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
|
2010-08-26 20:52:39 +04:00
|
|
|
int w;
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Logical I/Os can have logical, gang, or vdev children.
|
|
|
|
* Gang I/Os can have gang or vdev children.
|
|
|
|
* Vdev I/Os can only have vdev children.
|
|
|
|
* The following ASSERT captures all of these constraints.
|
|
|
|
*/
|
|
|
|
ASSERT(cio->io_child_type <= pio->io_child_type);
|
|
|
|
|
|
|
|
zl->zl_parent = pio;
|
|
|
|
zl->zl_child = cio;
|
|
|
|
|
|
|
|
mutex_enter(&cio->io_lock);
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (w = 0; w < ZIO_WAIT_TYPES; w++)
|
2009-02-18 23:51:31 +03:00
|
|
|
pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
|
|
|
|
|
|
|
|
list_insert_head(&pio->io_child_list, zl);
|
|
|
|
list_insert_head(&cio->io_parent_list, zl);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
pio->io_child_count++;
|
|
|
|
cio->io_parent_count++;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_exit(&cio->io_lock);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
ASSERT(zl->zl_parent == pio);
|
|
|
|
ASSERT(zl->zl_child == cio);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&cio->io_lock);
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
list_remove(&pio->io_child_list, zl);
|
|
|
|
list_remove(&cio->io_parent_list, zl);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
pio->io_child_count--;
|
|
|
|
cio->io_parent_count--;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_exit(&cio->io_lock);
|
|
|
|
|
|
|
|
kmem_cache_free(zio_link_cache, zl);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static boolean_t
|
|
|
|
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
uint64_t *countp = &zio->io_children[child][wait];
|
|
|
|
boolean_t waiting = B_FALSE;
|
|
|
|
|
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
ASSERT(zio->io_stall == NULL);
|
|
|
|
if (*countp != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage >>= 1;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_stall = countp;
|
|
|
|
waiting = B_TRUE;
|
|
|
|
}
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
|
|
|
|
return (waiting);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 21:58:00 +04:00
|
|
|
__attribute__((always_inline))
|
|
|
|
static inline void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
|
|
|
|
{
|
|
|
|
uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
|
|
|
|
int *errorp = &pio->io_child_error[zio->io_child_type];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
|
|
|
|
*errorp = zio_worst_error(*errorp, zio->io_error);
|
|
|
|
pio->io_reexecute |= zio->io_reexecute;
|
|
|
|
ASSERT3U(*countp, >, 0);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
|
|
|
|
(*countp)--;
|
|
|
|
|
|
|
|
if (*countp == 0 && pio->io_stall == countp) {
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_stall = NULL;
|
|
|
|
mutex_exit(&pio->io_lock);
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(pio);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
mutex_exit(&pio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
|
|
|
|
{
|
|
|
|
if (zio->io_child_error[c] != 0 && zio->io_error == 0)
|
|
|
|
zio->io_error = zio->io_child_error[c];
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* Create the various types of I/O (read, write, free, etc)
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
2008-11-20 23:01:55 +03:00
|
|
|
void *data, uint64_t size, zio_done_func_t *done, void *private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_type_t type, zio_priority_t priority, enum zio_flag flags,
|
2014-06-25 22:37:59 +04:00
|
|
|
vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
|
2010-05-29 00:45:14 +04:00
|
|
|
enum zio_stage stage, enum zio_stage pipeline)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
|
|
|
|
ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
|
|
|
|
|
|
|
|
ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
|
|
|
|
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
|
|
|
|
ASSERT(vd || stage == ZIO_STAGE_OPEN);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-11-21 03:09:39 +03:00
|
|
|
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
|
2015-01-30 22:25:19 +03:00
|
|
|
bzero(zio, sizeof (zio_t));
|
|
|
|
|
|
|
|
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
|
|
|
|
|
|
|
|
list_create(&zio->io_parent_list, sizeof (zio_link_t),
|
|
|
|
offsetof(zio_link_t, zl_parent_node));
|
|
|
|
list_create(&zio->io_child_list, sizeof (zio_link_t),
|
|
|
|
offsetof(zio_link_t, zl_child_node));
|
2009-02-18 23:51:31 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd != NULL)
|
|
|
|
zio->io_child_type = ZIO_CHILD_VDEV;
|
|
|
|
else if (flags & ZIO_FLAG_GANG_CHILD)
|
|
|
|
zio->io_child_type = ZIO_CHILD_GANG;
|
2010-05-29 00:45:14 +04:00
|
|
|
else if (flags & ZIO_FLAG_DDT_CHILD)
|
|
|
|
zio->io_child_type = ZIO_CHILD_DDT;
|
2008-12-03 23:09:06 +03:00
|
|
|
else
|
|
|
|
zio->io_child_type = ZIO_CHILD_LOGICAL;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (bp != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_bp = (blkptr_t *)bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_bp_copy = *bp;
|
|
|
|
zio->io_bp_orig = *bp;
|
2010-05-29 00:45:14 +04:00
|
|
|
if (type != ZIO_TYPE_WRITE ||
|
|
|
|
zio->io_child_type == ZIO_CHILD_DDT)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_logical = zio;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
|
|
|
|
pipeline |= ZIO_GANG_STAGES;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
zio->io_spa = spa;
|
|
|
|
zio->io_txg = txg;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_done = done;
|
|
|
|
zio->io_private = private;
|
|
|
|
zio->io_type = type;
|
|
|
|
zio->io_priority = priority;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_vd = vd;
|
|
|
|
zio->io_offset = offset;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_orig_data = zio->io_data = data;
|
|
|
|
zio->io_orig_size = zio->io_size = size;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_orig_flags = zio->io_flags = flags;
|
|
|
|
zio->io_orig_stage = zio->io_stage = stage;
|
|
|
|
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
|
|
|
|
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zb != NULL)
|
|
|
|
zio->io_bookmark = *zb;
|
|
|
|
|
|
|
|
if (pio != NULL) {
|
|
|
|
if (zio->io_logical == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_logical = pio->io_logical;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_child_type == ZIO_CHILD_GANG)
|
|
|
|
zio->io_gang_leader = pio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_add_child(pio, zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-11-08 04:26:52 +04:00
|
|
|
taskq_init_ent(&zio->io_tqent);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_destroy(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2015-01-30 22:25:19 +03:00
|
|
|
list_destroy(&zio->io_parent_list);
|
|
|
|
list_destroy(&zio->io_child_list);
|
|
|
|
mutex_destroy(&zio->io_lock);
|
|
|
|
cv_destroy(&zio->io_cv);
|
2008-12-03 23:09:06 +03:00
|
|
|
kmem_cache_free(zio_cache, zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
|
2010-05-29 00:45:14 +04:00
|
|
|
void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
|
2009-02-18 23:51:31 +03:00
|
|
|
ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
return (zio_null(NULL, spa, NULL, done, private, flags));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
|
|
|
|
void *data, uint64_t size, zio_done_func_t *done, void *private,
|
2014-06-25 22:37:59 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
|
2008-12-03 23:09:06 +03:00
|
|
|
data, size, done, private,
|
|
|
|
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
|
|
|
|
ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
|
2010-05-29 00:45:14 +04:00
|
|
|
void *data, uint64_t size, const zio_prop_t *zp,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
|
|
|
|
void *private,
|
2014-06-25 22:37:59 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
|
|
|
|
zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
|
|
|
|
zp->zp_compress >= ZIO_COMPRESS_OFF &&
|
|
|
|
zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
|
2012-12-14 03:24:15 +04:00
|
|
|
DMU_OT_IS_VALID(zp->zp_type) &&
|
2008-12-03 23:09:06 +03:00
|
|
|
zp->zp_level < 32 &&
|
2010-05-29 00:45:14 +04:00
|
|
|
zp->zp_copies > 0 &&
|
2013-05-10 23:47:54 +04:00
|
|
|
zp->zp_copies <= spa_max_replication(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
|
|
|
|
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zio->io_ready = ready;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio->io_physdone = physdone;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_prop = *zp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
/*
|
|
|
|
* Data can be NULL if we are going to call zio_write_override() to
|
|
|
|
* provide the already-allocated BP. But we may need the data to
|
|
|
|
* verify a dedup hit (if requested). In this case, don't try to
|
|
|
|
* dedup (just take the already-allocated BP verbatim).
|
|
|
|
*/
|
|
|
|
if (data == NULL && zio->io_prop.zp_dedup_verify) {
|
|
|
|
zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
uint64_t size, zio_done_func_t *done, void *private,
|
2014-06-25 22:37:59 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
|
|
|
|
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
void
|
2013-05-10 23:47:54 +04:00
|
|
|
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
|
|
|
|
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
|
|
|
* We must reset the io_prop to match the values that existed
|
|
|
|
* when the bp was first written by dmu_sync() keeping in mind
|
|
|
|
* that nopwrite and dedup are mutually exclusive.
|
|
|
|
*/
|
|
|
|
zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
|
|
|
|
zio->io_prop.zp_nopwrite = nopwrite;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_prop.zp_copies = copies;
|
|
|
|
zio->io_bp_override = bp;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
|
|
|
|
{
|
2014-06-06 01:19:08 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The check for EMBEDDED is a performance optimization. We
|
|
|
|
* process the free here (by ignoring it) rather than
|
|
|
|
* putting it on the list and then processing it in zio_free_sync().
|
|
|
|
*/
|
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return;
|
2013-09-04 16:00:57 +04:00
|
|
|
metaslab_check_free(spa, bp);
|
2013-07-03 20:13:38 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Frees that are for the currently-syncing txg, are not going to be
|
|
|
|
* deferred, and which will not need to do a read (i.e. not GANG or
|
|
|
|
* DEDUP), can be processed immediately. Otherwise, put them on the
|
|
|
|
* in-memory list for later processing.
|
|
|
|
*/
|
|
|
|
if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
|
|
|
|
txg != spa->spa_syncing_txg ||
|
|
|
|
spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
|
|
|
|
bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
|
|
|
|
} else {
|
|
|
|
VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|
|
|
enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
2013-07-03 20:13:38 +04:00
|
|
|
enum zio_stage stage = ZIO_FREE_PIPELINE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(!BP_IS_HOLE(bp));
|
|
|
|
ASSERT(spa_syncing_txg(spa) == txg);
|
2013-05-06 21:14:52 +04:00
|
|
|
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
|
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
metaslab_check_free(spa, bp);
|
2013-10-07 15:30:22 +04:00
|
|
|
arc_freed(spa, bp);
|
2013-09-04 16:00:57 +04:00
|
|
|
|
2013-07-03 20:13:38 +04:00
|
|
|
/*
|
|
|
|
* GANG and DEDUP blocks can induce a read (for the gang block header,
|
|
|
|
* or the DDT), so issue them asynchronously so that this thread is
|
|
|
|
* not tied up.
|
|
|
|
*/
|
|
|
|
if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
|
|
|
|
stage |= ZIO_STAGE_ISSUE_ASYNC;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
|
2013-07-03 20:13:38 +04:00
|
|
|
NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
|
|
|
|
NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|
|
|
zio_done_func_t *done, void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
dprintf_bp(bp, "claiming in txg %llu", txg);
|
|
|
|
|
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* A claim is an allocation of a specific block. Claims are needed
|
|
|
|
* to support immediate writes in the intent log. The issue is that
|
|
|
|
* immediate writes contain committed data, but in a txg that was
|
|
|
|
* *not* committed. Upon opening the pool after an unclean shutdown,
|
|
|
|
* the intent log claims all blocks that contain immediate write data
|
|
|
|
* so that the SPA knows they're in use.
|
|
|
|
*
|
|
|
|
* All claims *must* be resolved in the first txg -- before the SPA
|
|
|
|
* starts allocating blocks -- so that nothing is allocated twice.
|
2010-05-29 00:45:14 +04:00
|
|
|
* If txg == 0 we just verify that the block is claimable.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(txg == spa_first_txg(spa) || txg == 0);
|
|
|
|
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
|
|
|
|
done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
|
|
|
|
NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_done_func_t *done, void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
int c;
|
|
|
|
|
|
|
|
if (vd->vdev_children == 0) {
|
|
|
|
zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
|
|
|
|
|
|
|
|
zio->io_cmd = cmd;
|
|
|
|
} else {
|
2009-02-18 23:51:31 +03:00
|
|
|
zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
for (c = 0; c < vd->vdev_children; c++)
|
|
|
|
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPUs; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
done, private, flags));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
|
|
|
|
void *data, int checksum, zio_done_func_t *done, void *private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPUs; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(vd->vdev_children == 0);
|
|
|
|
ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
|
|
|
|
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
|
|
|
|
ASSERT3U(offset + size, <=, vd->vdev_psize);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
|
2014-09-23 03:42:03 +04:00
|
|
|
ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
|
|
|
|
NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_prop.zp_checksum = checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
|
|
|
|
void *data, int checksum, zio_done_func_t *done, void *private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPUs; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(vd->vdev_children == 0);
|
|
|
|
ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
|
|
|
|
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
|
|
|
|
ASSERT3U(offset + size, <=, vd->vdev_psize);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
|
2014-09-23 03:42:03 +04:00
|
|
|
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
|
|
|
|
NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_prop.zp_checksum = checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zio_checksum_table[checksum].ci_eck) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* zec checksums are necessarily destructive -- they modify
|
2008-12-03 23:09:06 +03:00
|
|
|
* the end of the write buffer to hold the verifier/checksum.
|
2008-11-20 23:01:55 +03:00
|
|
|
* Therefore, we must make a local copy in case the data is
|
2008-12-03 23:09:06 +03:00
|
|
|
* being written to multiple places in parallel.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
void *wbuf = zio_buf_alloc(size);
|
2008-11-20 23:01:55 +03:00
|
|
|
bcopy(data, wbuf, size);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_push_transform(zio, wbuf, size, size, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Create a child I/O to do some work for us.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
void *data, uint64_t size, int type, zio_priority_t priority,
|
|
|
|
enum zio_flag flags, zio_done_func_t *done, void *private)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
ASSERT(vd->vdev_parent ==
|
|
|
|
(pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (type == ZIO_TYPE_READ && bp != NULL) {
|
|
|
|
/*
|
|
|
|
* If we have the bp, then the child should perform the
|
|
|
|
* checksum and the parent need not. This pushes error
|
|
|
|
* detection as close to the leaves as possible and
|
|
|
|
* eliminates redundant checksums in the interior nodes.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
|
|
|
|
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd->vdev_children == 0)
|
|
|
|
offset += VDEV_LABEL_START_SIZE;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we've decided to do a repair, the write is not speculative --
|
|
|
|
* even if the original read was.
|
|
|
|
*/
|
|
|
|
if (flags & ZIO_FLAG_IO_REPAIR)
|
|
|
|
flags &= ~ZIO_FLAG_SPECULATIVE;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
|
2010-05-29 00:45:14 +04:00
|
|
|
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
|
|
|
|
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio->io_physdone = pio->io_physdone;
|
|
|
|
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
|
|
|
|
zio->io_logical->io_phys_children++;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *
|
|
|
|
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
int type, zio_priority_t priority, enum zio_flag flags,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_done_func_t *done, void *private)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(vd->vdev_ops->vdev_op_leaf);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
|
|
|
|
data, size, done, private, type, priority,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
|
2008-12-03 23:09:06 +03:00
|
|
|
vd, offset, NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_flush(zio_t *zio, vdev_t *vd)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
NULL, NULL,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
void
|
|
|
|
zio_shrink(zio_t *zio, uint64_t size)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_executor == NULL);
|
|
|
|
ASSERT(zio->io_orig_size == zio->io_size);
|
|
|
|
ASSERT(size <= zio->io_size);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't shrink for raidz because of problems with the
|
|
|
|
* reconstruction when reading back less than the block size.
|
|
|
|
* Note, BP_IS_RAIDZ() assumes no compression.
|
|
|
|
*/
|
|
|
|
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
|
|
|
|
if (!BP_IS_RAIDZ(zio->io_bp))
|
|
|
|
zio->io_orig_size = zio->io_size = size;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* Prepare to read and write logical blocks
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_read_bp_init(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-01-16 00:59:39 +03:00
|
|
|
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
|
2009-07-03 02:44:48 +04:00
|
|
|
zio->io_child_type == ZIO_CHILD_LOGICAL &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_RAW)) {
|
2014-06-06 01:19:08 +04:00
|
|
|
uint64_t psize =
|
|
|
|
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
|
2010-05-29 00:45:14 +04:00
|
|
|
void *cbuf = zio_buf_alloc(psize);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
decode_embedded_bp_compressed(bp, zio->io_data);
|
|
|
|
} else {
|
|
|
|
ASSERT(!BP_IS_EMBEDDED(bp));
|
|
|
|
}
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
|
|
|
|
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
|
|
|
|
|
|
|
|
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
|
|
|
|
zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static int
|
|
|
|
zio_write_bp_init(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_t *spa = zio->io_spa;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
2010-05-29 00:45:14 +04:00
|
|
|
enum zio_compress compress = zp->zp_compress;
|
2008-11-20 23:01:55 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-12-03 23:09:06 +03:00
|
|
|
uint64_t lsize = zio->io_size;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t psize = lsize;
|
2008-12-03 23:09:06 +03:00
|
|
|
int pass = 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* If our children haven't all reached the ready stage,
|
|
|
|
* wait for them and then repeat this pipeline stage.
|
|
|
|
*/
|
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
|
|
|
|
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (!IO_IS_ALLOCATING(zio))
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
|
|
|
|
|
|
|
|
if (zio->io_bp_override) {
|
|
|
|
ASSERT(bp->blk_birth != zio->io_txg);
|
|
|
|
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
|
|
|
|
|
|
|
|
*bp = *zio->io_bp_override;
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
|
|
|
* If we've been overridden and nopwrite is set then
|
|
|
|
* set the flag accordingly to indicate that a nopwrite
|
|
|
|
* has already occurred.
|
|
|
|
*/
|
|
|
|
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
|
|
|
|
ASSERT(!zp->zp_dedup);
|
|
|
|
zio->io_flags |= ZIO_FLAG_NOPWRITE;
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(!zp->zp_nopwrite);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
|
|
|
ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
|
|
|
|
zp->zp_dedup_verify);
|
|
|
|
|
|
|
|
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
|
|
|
|
BP_SET_DEDUP(bp, 1);
|
|
|
|
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* We're rewriting an existing block, which means we're
|
|
|
|
* working on behalf of spa_sync(). For spa_sync() to
|
|
|
|
* converge, it must eventually be the case that we don't
|
|
|
|
* have to allocate new blocks. But compression changes
|
|
|
|
* the blocksize, which forces a reallocate, and makes
|
|
|
|
* convergence take longer. Therefore, after the first
|
|
|
|
* few passes, stop compressing to ensure convergence.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
pass = spa_sync_pass(spa);
|
|
|
|
|
|
|
|
ASSERT(zio->io_txg == spa_syncing_txg(spa));
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(!BP_GET_DEDUP(bp));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-05-06 21:14:52 +04:00
|
|
|
if (pass >= zfs_sync_pass_dont_compress)
|
2008-12-03 23:09:06 +03:00
|
|
|
compress = ZIO_COMPRESS_OFF;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* Make sure someone doesn't change their mind on overwrites */
|
2014-06-06 01:19:08 +04:00
|
|
|
ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (compress != ZIO_COMPRESS_OFF) {
|
2010-05-29 00:45:14 +04:00
|
|
|
void *cbuf = zio_buf_alloc(lsize);
|
|
|
|
psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
|
|
|
|
if (psize == 0 || psize == lsize) {
|
2008-12-03 23:09:06 +03:00
|
|
|
compress = ZIO_COMPRESS_OFF;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_buf_free(cbuf, lsize);
|
2014-06-06 01:19:08 +04:00
|
|
|
} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
|
|
|
|
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
|
|
|
|
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
|
|
|
|
encode_embedded_bp_compressed(bp,
|
|
|
|
cbuf, compress, lsize, psize);
|
|
|
|
BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
|
|
|
|
BP_SET_TYPE(bp, zio->io_prop.zp_type);
|
|
|
|
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
|
|
|
|
zio_buf_free(cbuf, lsize);
|
|
|
|
bp->blk_birth = zio->io_txg;
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
ASSERT(spa_feature_is_active(spa,
|
|
|
|
SPA_FEATURE_EMBEDDED_DATA));
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else {
|
2014-06-06 01:19:08 +04:00
|
|
|
/*
|
|
|
|
* Round up compressed size to MINBLOCKSIZE and
|
|
|
|
* zero the tail.
|
|
|
|
*/
|
|
|
|
size_t rounded =
|
|
|
|
P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
|
|
|
|
if (rounded > psize) {
|
|
|
|
bzero((char *)cbuf + psize, rounded - psize);
|
|
|
|
psize = rounded;
|
|
|
|
}
|
|
|
|
if (psize == lsize) {
|
|
|
|
compress = ZIO_COMPRESS_OFF;
|
|
|
|
zio_buf_free(cbuf, lsize);
|
|
|
|
} else {
|
|
|
|
zio_push_transform(zio, cbuf,
|
|
|
|
psize, lsize, NULL);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* The final pass of spa_sync() must be all rewrites, but the first
|
|
|
|
* few passes offer a trade-off: allocating blocks defers convergence,
|
|
|
|
* but newly allocated blocks are sequential, so they can be written
|
|
|
|
* to disk faster. Therefore, we allow the first few passes of
|
|
|
|
* spa_sync() to allocate new blocks, but force rewrites after that.
|
|
|
|
* There should only be a handful of blocks after pass 1 in any case.
|
|
|
|
*/
|
2013-12-09 22:37:51 +04:00
|
|
|
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
|
|
|
|
BP_GET_PSIZE(bp) == psize &&
|
2013-05-06 21:14:52 +04:00
|
|
|
pass >= zfs_sync_pass_rewrite) {
|
2010-05-29 00:45:14 +04:00
|
|
|
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
|
2010-08-26 20:52:39 +04:00
|
|
|
ASSERT(psize != 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
|
|
|
|
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
|
|
|
|
} else {
|
|
|
|
BP_ZERO(bp);
|
|
|
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (psize == 0) {
|
2013-12-09 22:37:51 +04:00
|
|
|
if (zio->io_bp_orig.blk_birth != 0 &&
|
|
|
|
spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
|
|
|
|
BP_SET_LSIZE(bp, lsize);
|
|
|
|
BP_SET_TYPE(bp, zp->zp_type);
|
|
|
|
BP_SET_LEVEL(bp, zp->zp_level);
|
|
|
|
BP_SET_BIRTH(bp, zio->io_txg, 0);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
} else {
|
|
|
|
ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
|
|
|
|
BP_SET_LSIZE(bp, lsize);
|
2013-12-09 22:37:51 +04:00
|
|
|
BP_SET_TYPE(bp, zp->zp_type);
|
|
|
|
BP_SET_LEVEL(bp, zp->zp_level);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_PSIZE(bp, psize);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_COMPRESS(bp, compress);
|
|
|
|
BP_SET_CHECKSUM(bp, zp->zp_checksum);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_DEDUP(bp, zp->zp_dedup);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zp->zp_dedup) {
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
|
|
|
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
|
|
|
|
}
|
2013-05-10 23:47:54 +04:00
|
|
|
if (zp->zp_nopwrite) {
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
|
|
|
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_free_bp_init(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
|
|
|
|
if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
|
|
|
|
if (BP_GET_DEDUP(bp))
|
|
|
|
zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Execute the I/O pipeline
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void
|
2013-05-06 23:24:30 +04:00
|
|
|
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_t *spa = zio->io_spa;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_type_t t = zio->io_type;
|
2011-11-08 04:26:52 +04:00
|
|
|
int flags = (cutinline ? TQ_FRONT : 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2009-07-03 02:44:48 +04:00
|
|
|
* If we're a config writer or a probe, the normal issue and
|
|
|
|
* interrupt threads may all be blocked waiting for the config lock.
|
|
|
|
* In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
|
2008-12-03 23:09:06 +03:00
|
|
|
t = ZIO_TYPE_NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* A similar issue exists for the L2ARC write thread until L2ARC 2.0.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
|
|
|
|
t = ZIO_TYPE_NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
2013-05-06 23:24:30 +04:00
|
|
|
* If this is a high priority I/O, then use the high priority taskq if
|
|
|
|
* available.
|
2010-05-29 00:45:14 +04:00
|
|
|
*/
|
|
|
|
if (zio->io_priority == ZIO_PRIORITY_NOW &&
|
2013-05-06 23:24:30 +04:00
|
|
|
spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
|
2010-05-29 00:45:14 +04:00
|
|
|
q++;
|
|
|
|
|
|
|
|
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
|
2010-08-26 21:32:23 +04:00
|
|
|
|
2011-11-08 04:26:52 +04:00
|
|
|
/*
|
|
|
|
* NB: We are assuming that the zio can only be dispatched
|
|
|
|
* to a single taskq at a time. It would be a grievous error
|
|
|
|
* to dispatch the zio to another taskq at the same time.
|
|
|
|
*/
|
|
|
|
ASSERT(taskq_empty_ent(&zio->io_tqent));
|
2013-05-06 23:24:30 +04:00
|
|
|
spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
|
|
|
|
flags, &zio->io_tqent);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static boolean_t
|
2013-05-06 23:24:30 +04:00
|
|
|
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
kthread_t *executor = zio->io_executor;
|
|
|
|
spa_t *spa = zio->io_spa;
|
2010-08-26 20:52:39 +04:00
|
|
|
zio_type_t t;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-05-06 23:24:30 +04:00
|
|
|
for (t = 0; t < ZIO_TYPES; t++) {
|
|
|
|
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
|
|
|
|
uint_t i;
|
|
|
|
for (i = 0; i < tqs->stqs_count; i++) {
|
|
|
|
if (taskq_member(tqs->stqs_taskq[i], executor))
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (B_FALSE);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static int
|
|
|
|
zio_issue_async(zio_t *zio)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_interrupt(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Execute the I/O pipeline until one of the following occurs:
|
|
|
|
* (1) the I/O completes; (2) the pipeline stalls waiting for
|
|
|
|
* dependent child I/Os; (3) the I/O issues, so we're waiting
|
|
|
|
* for an I/O completion interrupt; (4) the I/O is delegated by
|
|
|
|
* vdev-level caching or aggregation; (5) the I/O is deferred
|
|
|
|
* due to vdev-level queueing; (6) the I/O is handed off to
|
|
|
|
* another thread. In all cases, the pipeline stops whenever
|
2013-07-03 06:00:16 +04:00
|
|
|
* there's no CPU work; it never burns a thread in cv_wait_io().
|
2008-12-03 23:09:06 +03:00
|
|
|
*
|
|
|
|
* There's no locking on io_stage because there's no legitimate way
|
|
|
|
* for multiple threads to be attempting to process the same I/O.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static zio_pipe_stage_t *zio_pipeline[];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
/*
|
|
|
|
* zio_execute() is a wrapper around the static function
|
|
|
|
* __zio_execute() so that we can force __zio_execute() to be
|
|
|
|
* inlined. This reduces stack overhead which is important
|
|
|
|
* because __zio_execute() is called recursively in several zio
|
|
|
|
* code paths. zio_execute() itself cannot be inlined because
|
|
|
|
* it is externally visible.
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_execute(zio_t *zio)
|
2010-08-26 22:38:38 +04:00
|
|
|
{
|
2014-07-13 22:35:19 +04:00
|
|
|
fstrans_cookie_t cookie;
|
|
|
|
|
|
|
|
cookie = spl_fstrans_mark();
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(zio);
|
2014-07-13 22:35:19 +04:00
|
|
|
spl_fstrans_unmark(cookie);
|
2010-08-26 22:38:38 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
__attribute__((always_inline))
|
|
|
|
static inline void
|
|
|
|
__zio_execute(zio_t *zio)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
zio->io_executor = curthread;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
while (zio->io_stage < ZIO_STAGE_DONE) {
|
2010-05-29 00:45:14 +04:00
|
|
|
enum zio_stage pipeline = zio->io_pipeline;
|
|
|
|
enum zio_stage stage = zio->io_stage;
|
2012-12-18 04:23:27 +04:00
|
|
|
dsl_pool_t *dp;
|
2011-05-26 02:22:04 +04:00
|
|
|
boolean_t cut;
|
2008-12-03 23:09:06 +03:00
|
|
|
int rv;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(!MUTEX_HELD(&zio->io_lock));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(ISP2(stage));
|
|
|
|
ASSERT(zio->io_stall == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
do {
|
|
|
|
stage <<= 1;
|
|
|
|
} while ((stage & pipeline) == 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
ASSERT(stage <= ZIO_STAGE_DONE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2012-12-18 04:23:27 +04:00
|
|
|
dp = spa_get_dsl(zio->io_spa);
|
2011-05-26 02:22:04 +04:00
|
|
|
cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
|
|
|
|
zio_requeue_io_start_cut_in_line : B_FALSE;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* If we are in interrupt context and this pipeline stage
|
|
|
|
* will grab a config lock that is held across I/O,
|
2010-05-29 00:45:14 +04:00
|
|
|
* or may wait for an I/O that needs an interrupt thread
|
|
|
|
* to complete, issue async to avoid deadlock.
|
|
|
|
*
|
|
|
|
* For VDEV_IO_START, we cut in line so that the io will
|
|
|
|
* be sent to disk promptly.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2012-12-18 04:23:27 +04:00
|
|
|
if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
|
|
|
|
zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
|
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we executing in the context of the tx_sync_thread,
|
|
|
|
* or we are performing pool initialization outside of a
|
2013-08-15 03:18:58 +04:00
|
|
|
* zio_taskq[ZIO_TASKQ_ISSUE|ZIO_TASKQ_ISSUE_HIGH] context.
|
|
|
|
* Then issue the zio asynchronously to minimize stack usage
|
|
|
|
* for these deep call paths.
|
2012-12-18 04:23:27 +04:00
|
|
|
*/
|
|
|
|
if ((dp && curthread == dp->dp_tx.tx_sync_thread) ||
|
|
|
|
(dp && spa_is_initializing(dp->dp_spa) &&
|
2013-08-15 03:18:58 +04:00
|
|
|
!zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
|
|
|
|
!zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
|
2008-12-03 23:09:06 +03:00
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_stage = stage;
|
2014-04-16 07:40:22 +04:00
|
|
|
rv = zio_pipeline[highbit64(stage) - 1](zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (rv == ZIO_PIPELINE_STOP)
|
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(rv == ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Initiate I/O, either sync or async
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zio_wait(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
int error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
|
|
|
|
ASSERT(zio->io_executor == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_waiter = curthread;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
2012-12-21 06:15:34 +04:00
|
|
|
while (zio->io_executor != NULL)
|
2012-12-21 06:40:20 +04:00
|
|
|
cv_wait_io(&zio->io_cv, &zio->io_lock);
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&zio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
error = zio->io_error;
|
|
|
|
zio_destroy(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (error);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_nowait(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_executor == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
|
|
|
|
zio_unique_parent(zio) == NULL) {
|
2014-10-08 00:20:49 +04:00
|
|
|
zio_t *pio;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* This is a logical async I/O with no parent to wait for it.
|
2009-07-03 02:44:48 +04:00
|
|
|
* We add it to the spa_async_root_zio "Godfather" I/O which
|
|
|
|
* will ensure they complete prior to unloading the pool.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
spa_t *spa = zio->io_spa;
|
2014-10-08 00:20:49 +04:00
|
|
|
kpreempt_disable();
|
|
|
|
pio = spa->spa_async_zio_root[CPU_SEQID];
|
|
|
|
kpreempt_enable();
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2014-10-08 00:20:49 +04:00
|
|
|
zio_add_child(pio, zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Reexecute or suspend/resume failed I/O
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_reexecute(zio_t *pio)
|
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *cio, *cio_next;
|
2010-08-26 20:52:39 +04:00
|
|
|
int c, w;
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(pio->io_gang_leader == NULL);
|
|
|
|
ASSERT(pio->io_gang_tree == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_flags = pio->io_orig_flags;
|
|
|
|
pio->io_stage = pio->io_orig_stage;
|
|
|
|
pio->io_pipeline = pio->io_orig_pipeline;
|
|
|
|
pio->io_reexecute = 0;
|
2013-05-10 23:47:54 +04:00
|
|
|
pio->io_flags |= ZIO_FLAG_REEXECUTED;
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_error = 0;
|
2010-08-26 20:52:39 +04:00
|
|
|
for (w = 0; w < ZIO_WAIT_TYPES; w++)
|
2009-02-18 23:51:31 +03:00
|
|
|
pio->io_state[w] = 0;
|
2010-08-26 20:52:39 +04:00
|
|
|
for (c = 0; c < ZIO_CHILD_TYPES; c++)
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_child_error[c] = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (IO_IS_ALLOCATING(pio))
|
|
|
|
BP_ZERO(pio->io_bp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* As we reexecute pio's children, new children could be created.
|
2009-02-18 23:51:31 +03:00
|
|
|
* New children go to the head of pio's io_child_list, however,
|
2008-12-03 23:09:06 +03:00
|
|
|
* so we will (correctly) not reexecute them. The key is that
|
2009-02-18 23:51:31 +03:00
|
|
|
* the remainder of pio's io_child_list, from 'cio_next' onward,
|
|
|
|
* cannot be affected by any side effects of reexecuting 'cio'.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
2009-02-18 23:51:31 +03:00
|
|
|
for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
|
|
|
|
cio_next = zio_walk_children(pio);
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
2010-08-26 20:52:39 +04:00
|
|
|
for (w = 0; w < ZIO_WAIT_TYPES; w++)
|
2009-02-18 23:51:31 +03:00
|
|
|
pio->io_children[cio->io_child_type][w]++;
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_reexecute(cio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Now that all children have been reexecuted, execute the parent.
|
2009-07-03 02:44:48 +04:00
|
|
|
* We don't reexecute "The Godfather" I/O here as it's the
|
|
|
|
* responsibility of the caller to wait on him.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(pio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_suspend(spa_t *spa, zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
|
|
|
|
fm_panic("Pool '%s' has encountered an uncorrectable I/O "
|
|
|
|
"failure and the failure mode property for this pool "
|
|
|
|
"is set to panic.", spa_name(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-07-11 02:13:09 +04:00
|
|
|
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
|
|
|
|
"failure and has been suspended.\n", spa_name(spa));
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&spa->spa_suspend_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (spa->spa_suspend_zio_root == NULL)
|
2009-07-03 02:44:48 +04:00
|
|
|
spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
|
|
|
|
ZIO_FLAG_GODFATHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
spa->spa_suspended = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio != NULL) {
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio != spa->spa_suspend_zio_root);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
2009-02-18 23:51:31 +03:00
|
|
|
ASSERT(zio_unique_parent(zio) == NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_DONE);
|
|
|
|
zio_add_child(spa->spa_suspend_zio_root, zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&spa->spa_suspend_lock);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_resume(spa_t *spa)
|
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *pio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Reexecute all previously suspended i/o.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&spa->spa_suspend_lock);
|
|
|
|
spa->spa_suspended = B_FALSE;
|
|
|
|
cv_broadcast(&spa->spa_suspend_cv);
|
|
|
|
pio = spa->spa_suspend_zio_root;
|
|
|
|
spa->spa_suspend_zio_root = NULL;
|
|
|
|
mutex_exit(&spa->spa_suspend_lock);
|
|
|
|
|
|
|
|
if (pio == NULL)
|
2009-07-03 02:44:48 +04:00
|
|
|
return (0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_reexecute(pio);
|
|
|
|
return (zio_wait(pio));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_resume_wait(spa_t *spa)
|
|
|
|
{
|
|
|
|
mutex_enter(&spa->spa_suspend_lock);
|
|
|
|
while (spa_suspended(spa))
|
|
|
|
cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
|
|
|
|
mutex_exit(&spa->spa_suspend_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* Gang blocks.
|
|
|
|
*
|
|
|
|
* A gang block is a collection of small blocks that looks to the DMU
|
|
|
|
* like one large block. When zio_dva_allocate() cannot find a block
|
|
|
|
* of the requested size, due to either severe fragmentation or the pool
|
|
|
|
* being nearly full, it calls zio_write_gang_block() to construct the
|
|
|
|
* block from smaller fragments.
|
|
|
|
*
|
|
|
|
* A gang block consists of a gang header (zio_gbh_phys_t) and up to
|
|
|
|
* three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
|
|
|
|
* an indirect block: it's an array of block pointers. It consumes
|
|
|
|
* only one sector and hence is allocatable regardless of fragmentation.
|
|
|
|
* The gang header's bps point to its gang members, which hold the data.
|
|
|
|
*
|
|
|
|
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
|
|
|
|
* as the verifier to ensure uniqueness of the SHA256 checksum.
|
|
|
|
* Critically, the gang block bp's blk_cksum is the checksum of the data,
|
|
|
|
* not the gang header. This ensures that data block signatures (needed for
|
|
|
|
* deduplication) are independent of how the block is physically stored.
|
|
|
|
*
|
|
|
|
* Gang blocks can be nested: a gang member may itself be a gang block.
|
|
|
|
* Thus every gang block is a tree in which root and all interior nodes are
|
|
|
|
* gang headers, and the leaves are normal blocks that contain user data.
|
|
|
|
* The root of the gang tree is called the gang leader.
|
|
|
|
*
|
|
|
|
* To perform any operation (read, rewrite, free, claim) on a gang block,
|
|
|
|
* zio_gang_assemble() first assembles the gang tree (minus data leaves)
|
|
|
|
* in the io_gang_tree field of the original logical i/o by recursively
|
|
|
|
* reading the gang leader and all gang headers below it. This yields
|
|
|
|
* an in-core tree containing the contents of every gang header and the
|
|
|
|
* bps for every constituent of the gang block.
|
|
|
|
*
|
|
|
|
* With the gang tree now assembled, zio_gang_issue() just walks the gang tree
|
|
|
|
* and invokes a callback on each bp. To free a gang block, zio_gang_issue()
|
|
|
|
* calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
|
|
|
|
* zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
|
|
|
|
* zio_read_gang() is a wrapper around zio_read() that omits reading gang
|
|
|
|
* headers, since we already have those in io_gang_tree. zio_rewrite_gang()
|
|
|
|
* performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
|
|
|
|
* of the gang header plus zio_checksum_compute() of the data to update the
|
|
|
|
* gang header's blk_cksum as described above.
|
|
|
|
*
|
|
|
|
* The two-phase assemble/issue model solves the problem of partial failure --
|
|
|
|
* what if you'd freed part of a gang block but then couldn't read the
|
|
|
|
* gang header for another part? Assembling the entire gang tree first
|
|
|
|
* ensures that all the necessary gang header I/O has succeeded before
|
|
|
|
* starting the actual work of free, claim, or write. Once the gang tree
|
|
|
|
* is assembled, free and claim are in-memory operations that cannot fail.
|
|
|
|
*
|
|
|
|
* In the event that a gang write fails, zio_dva_unallocate() walks the
|
|
|
|
* gang tree to immediately free (i.e. insert back into the space map)
|
|
|
|
* everything we've allocated. This ensures that we don't get ENOSPC
|
|
|
|
* errors during repeated suspend/resume cycles due to a flaky device.
|
|
|
|
*
|
|
|
|
* Gang rewrites only happen during sync-to-convergence. If we can't assemble
|
|
|
|
* the gang tree, we won't modify the block, so we can safely defer the free
|
|
|
|
* (knowing that the block is still intact). If we *can* assemble the gang
|
|
|
|
* tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
|
|
|
|
* each constituent bp and we can allocate a new block on the next sync pass.
|
|
|
|
*
|
|
|
|
* In all cases, the gang tree allows complete recovery from partial failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
static zio_t *
|
|
|
|
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
if (gn != NULL)
|
|
|
|
return (pio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
|
|
|
|
NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
|
|
|
|
&pio->io_bookmark));
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
|
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
if (gn != NULL) {
|
|
|
|
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
|
|
|
|
gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
|
|
|
|
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* As we rewrite each gang header, the pipeline will compute
|
|
|
|
* a new gang block header checksum for it; but no one will
|
|
|
|
* compute a new data checksum, so we do that here. The one
|
|
|
|
* exception is the gang leader: the pipeline already computed
|
|
|
|
* its data checksum because that stage precedes gang assembly.
|
|
|
|
* (Presently, nothing actually uses interior data checksums;
|
|
|
|
* this is just good hygiene.)
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
if (gn != pio->io_gang_leader->io_gang_tree) {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
|
|
|
|
data, BP_GET_PSIZE(bp));
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* If we are here to damage data for testing purposes,
|
|
|
|
* leave the GBH alone so that we can detect the damage.
|
|
|
|
*/
|
|
|
|
if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
|
|
|
|
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
|
|
|
|
data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
|
|
|
|
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* ARGSUSED */
|
|
|
|
zio_t *
|
|
|
|
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
|
|
|
|
ZIO_GANG_CHILD_FLAGS(pio)));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* ARGSUSED */
|
|
|
|
zio_t *
|
|
|
|
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
|
|
|
|
NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
|
|
|
|
}
|
|
|
|
|
|
|
|
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
|
|
|
|
NULL,
|
|
|
|
zio_read_gang,
|
|
|
|
zio_rewrite_gang,
|
|
|
|
zio_free_gang,
|
|
|
|
zio_claim_gang,
|
|
|
|
NULL
|
|
|
|
};
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void zio_gang_tree_assemble_done(zio_t *zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
 * Allocate a gang node with kmem_zalloc() together with a
 * SPA_GANGBLOCKSIZE buffer for its gang header, store the node in
 * *gnpp (which must be NULL on entry), and return it.
 */
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *node;

	ASSERT(*gnpp == NULL);

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	node->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);

	*gnpp = node;
	return (node);
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_free(zio_gang_node_t **gnpp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = *gnpp;
|
2010-08-26 20:52:39 +04:00
|
|
|
int g;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(gn->gn_child[g] == NULL);
|
|
|
|
|
|
|
|
zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
|
|
|
|
kmem_free(gn, sizeof (*gn));
|
|
|
|
*gnpp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_gang_tree_free(zio_gang_node_t **gnpp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = *gnpp;
|
2010-08-26 20:52:39 +04:00
|
|
|
int g;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (gn == NULL)
|
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_tree_free(&gn->gn_child[g]);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_free(gnpp);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(gio->io_gang_leader == gio);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(BP_IS_GANG(bp));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
|
2008-12-03 23:09:06 +03:00
|
|
|
SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
|
2009-07-03 02:44:48 +04:00
|
|
|
gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_gang_tree_assemble_done(zio_t *zio)
|
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *gio = zio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = zio->io_private;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
2010-08-26 20:52:39 +04:00
|
|
|
int g;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(gio == zio_unique_parent(zio));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_count == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error)
|
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (BP_SHOULD_BYTESWAP(bp))
|
|
|
|
byteswap_uint64_array(zio->io_data, zio->io_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_data == gn->gn_gbh);
|
|
|
|
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
|
|
|
|
if (!BP_IS_GANG(gbp))
|
|
|
|
continue;
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *gio = pio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
2010-08-26 20:52:39 +04:00
|
|
|
int g;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(BP_IS_GANG(bp) == !!gn);
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
|
|
|
|
ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* If you're a gang header, your data is in gn->gn_gbh.
|
|
|
|
* If you're a gang member, your data is in 'data' and gn == NULL.
|
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (gn != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
|
|
|
|
if (BP_IS_HOLE(gbp))
|
|
|
|
continue;
|
|
|
|
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
|
|
|
|
data = (char *)data + BP_GET_PSIZE(gbp);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (gn == gio->io_gang_tree)
|
|
|
|
ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio != pio)
|
|
|
|
zio_nowait(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_assemble(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
|
|
|
|
|
|
|
zio->io_gang_leader = zio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_issue(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
|
2008-12-03 23:09:06 +03:00
|
|
|
else
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_free(&zio->io_gang_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write_gang_member_ready(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *pio = zio_unique_parent(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
dva_t *cdva = zio->io_bp->blk_dva;
|
|
|
|
dva_t *pdva = pio->io_bp->blk_dva;
|
|
|
|
uint64_t asize;
|
2010-08-26 20:52:39 +04:00
|
|
|
int d;
|
2013-11-01 23:26:11 +04:00
|
|
|
ASSERTV(zio_t *gio = zio->io_gang_leader);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (BP_IS_HOLE(zio->io_bp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
|
|
|
|
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
|
|
|
|
ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
|
|
|
|
ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
2010-08-26 20:52:39 +04:00
|
|
|
for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(DVA_GET_GANG(&pdva[d]));
|
|
|
|
asize = DVA_GET_ASIZE(&pdva[d]);
|
|
|
|
asize += DVA_GET_ASIZE(&cdva[d]);
|
|
|
|
DVA_SET_ASIZE(&pdva[d], asize);
|
|
|
|
}
|
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write_gang_block(zio_t *pio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
spa_t *spa = pio->io_spa;
|
|
|
|
blkptr_t *bp = pio->io_bp;
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *gio = pio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
|
|
|
zio_gang_node_t *gn, **gnpp;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_gbh_phys_t *gbh;
|
2008-12-03 23:09:06 +03:00
|
|
|
uint64_t txg = pio->io_txg;
|
|
|
|
uint64_t resid = pio->io_size;
|
|
|
|
uint64_t lsize;
|
2010-05-29 00:45:14 +04:00
|
|
|
int copies = gio->io_prop.zp_copies;
|
|
|
|
int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_prop_t zp;
|
2010-08-26 20:52:39 +04:00
|
|
|
int g, error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
|
|
|
|
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
|
2008-12-03 23:09:06 +03:00
|
|
|
METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_error = error;
|
2008-11-20 23:01:55 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (pio == gio) {
|
|
|
|
gnpp = &gio->io_gang_tree;
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
gnpp = pio->io_private;
|
|
|
|
ASSERT(pio->io_ready == zio_write_gang_member_ready);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
gn = zio_gang_node_alloc(gnpp);
|
|
|
|
gbh = gn->gn_gbh;
|
|
|
|
bzero(gbh, SPA_GANGBLOCKSIZE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Create the gang header.
|
|
|
|
*/
|
|
|
|
zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
|
|
|
|
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Create and nowait the gang children.
|
|
|
|
*/
|
2010-08-26 20:52:39 +04:00
|
|
|
for (g = 0; resid != 0; resid -= lsize, g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
|
|
|
|
SPA_MINBLOCKSIZE);
|
|
|
|
ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zp.zp_checksum = gio->io_prop.zp_checksum;
|
2008-12-03 23:09:06 +03:00
|
|
|
zp.zp_compress = ZIO_COMPRESS_OFF;
|
|
|
|
zp.zp_type = DMU_OT_NONE;
|
|
|
|
zp.zp_level = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
zp.zp_copies = gio->io_prop.zp_copies;
|
2013-05-10 23:47:54 +04:00
|
|
|
zp.zp_dedup = B_FALSE;
|
|
|
|
zp.zp_dedup_verify = B_FALSE;
|
|
|
|
zp.zp_nopwrite = B_FALSE;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
|
|
|
|
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
|
|
|
|
&pio->io_bookmark));
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Set pio's pipeline to just wait for zio to finish.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks across vdevs is important for performance: indeed, using
multiple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implicitly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
lingering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
/*
|
|
|
|
* We didn't allocate this bp, so make sure it doesn't get unmarked.
|
|
|
|
*/
|
|
|
|
pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_nowait(zio);
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
|
|
|
* The zio_nop_write stage in the pipeline determines if allocating
|
|
|
|
* a new bp is necessary. By leveraging a cryptographically secure checksum,
|
|
|
|
* such as SHA256, we can compare the checksums of the new data and the old
|
|
|
|
* to determine if allocating a new block is required. The nopwrite
|
|
|
|
* feature can handle writes in either syncing or open context (i.e. zil
|
|
|
|
* writes) and as a result is mutually exclusive with dedup.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zio_nop_write(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_LEVEL(bp) == 0);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
|
|
|
ASSERT(zp->zp_nopwrite);
|
|
|
|
ASSERT(!zp->zp_dedup);
|
|
|
|
ASSERT(zio->io_bp_override == NULL);
|
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check to see if the original bp and the new bp have matching
|
|
|
|
* characteristics (i.e. same checksum, compression algorithms, etc).
|
|
|
|
* If they don't then just continue with the pipeline which will
|
|
|
|
* allocate a new bp.
|
|
|
|
*/
|
|
|
|
if (BP_IS_HOLE(bp_orig) ||
|
|
|
|
!zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
|
|
|
|
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
|
|
|
|
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
|
|
|
|
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
|
|
|
|
zp->zp_copies != BP_GET_NDVAS(bp_orig))
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the checksums match then reset the pipeline so that we
|
|
|
|
* avoid allocating a new bp and issuing any I/O.
|
|
|
|
*/
|
|
|
|
if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
|
|
|
|
ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
|
|
|
|
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
|
|
|
|
ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
|
|
|
|
ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
|
|
|
|
ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
|
|
|
|
sizeof (uint64_t)) == 0);
|
|
|
|
|
|
|
|
*bp = *bp_orig;
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
zio->io_flags |= ZIO_FLAG_NOPWRITE;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2010-05-29 00:45:14 +04:00
|
|
|
* Dedup
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
zio_ddt_child_read_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp;
|
|
|
|
zio_t *pio = zio_unique_parent(zio);
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
ddp = ddt_phys_select(dde, bp);
|
|
|
|
if (zio->io_error == 0)
|
|
|
|
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
|
|
|
|
if (zio->io_error == 0 && dde->dde_repair_data == NULL)
|
|
|
|
dde->dde_repair_data = zio->io_data;
|
|
|
|
else
|
|
|
|
zio_buf_free(zio->io_data, zio->io_size);
|
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_ddt_read_start(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
2010-08-26 20:52:39 +04:00
|
|
|
int p;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
if (zio->io_child_error[ZIO_CHILD_DDT]) {
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
|
|
|
ddt_entry_t *dde = ddt_repair_start(ddt, bp);
|
|
|
|
ddt_phys_t *ddp = dde->dde_phys;
|
|
|
|
ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
|
|
|
|
blkptr_t blk;
|
|
|
|
|
|
|
|
ASSERT(zio->io_vsd == NULL);
|
|
|
|
zio->io_vsd = dde;
|
|
|
|
|
|
|
|
if (ddp_self == NULL)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
|
|
|
|
continue;
|
|
|
|
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
|
|
|
|
&blk);
|
|
|
|
zio_nowait(zio_read(zio, zio->io_spa, &blk,
|
|
|
|
zio_buf_alloc(zio->io_size), zio->io_size,
|
|
|
|
zio_ddt_child_read_done, dde, zio->io_priority,
|
|
|
|
ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
|
|
|
|
&zio->io_bookmark));
|
|
|
|
}
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_nowait(zio_read(zio, zio->io_spa, bp,
|
|
|
|
zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
|
|
|
|
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_ddt_read_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
|
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
if (zio->io_child_error[ZIO_CHILD_DDT]) {
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
|
|
|
ddt_entry_t *dde = zio->io_vsd;
|
|
|
|
if (ddt == NULL) {
|
|
|
|
ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
if (dde == NULL) {
|
|
|
|
zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
|
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
|
|
|
}
|
|
|
|
if (dde->dde_repair_data != NULL) {
|
|
|
|
bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
|
|
|
|
zio->io_child_error[ZIO_CHILD_DDT] = 0;
|
|
|
|
}
|
|
|
|
ddt_repair_done(ddt, dde);
|
|
|
|
zio->io_vsd = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(zio->io_vsd == NULL);
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static boolean_t
|
|
|
|
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
2010-08-26 20:52:39 +04:00
|
|
|
int p;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Note: we compare the original data, not the transformed data,
|
|
|
|
* because when zio->io_bp is an override bp, we will not have
|
|
|
|
* pushed the I/O transforms. That's an important optimization
|
|
|
|
* because otherwise we'd compress/encrypt all dmu_sync() data twice.
|
|
|
|
*/
|
2010-08-26 20:52:39 +04:00
|
|
|
for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_t *lio = dde->dde_lead_zio[p];
|
|
|
|
|
|
|
|
if (lio != NULL) {
|
|
|
|
return (lio->io_orig_size != zio->io_orig_size ||
|
|
|
|
bcmp(zio->io_orig_data, lio->io_orig_data,
|
|
|
|
zio->io_orig_size) != 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
|
|
|
|
if (ddp->ddp_phys_birth != 0) {
|
|
|
|
arc_buf_t *abuf = NULL;
|
|
|
|
uint32_t aflags = ARC_WAIT;
|
|
|
|
blkptr_t blk = *zio->io_bp;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
|
2013-07-03 00:26:24 +04:00
|
|
|
error = arc_read(NULL, spa, &blk,
|
2010-05-29 00:45:14 +04:00
|
|
|
arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
|
|
|
|
&aflags, &zio->io_bookmark);
|
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
if (arc_buf_size(abuf) != zio->io_orig_size ||
|
|
|
|
bcmp(abuf->b_data, zio->io_orig_data,
|
|
|
|
zio->io_orig_size) != 0)
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EEXIST);
|
2013-09-04 16:00:57 +04:00
|
|
|
VERIFY(arc_buf_remove_ref(abuf, &abuf));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
return (error != 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zio_ddt_child_write_ready(zio_t *zio)
|
|
|
|
{
|
|
|
|
int p = zio->io_prop.zp_copies;
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
zio_t *pio;
|
|
|
|
|
|
|
|
if (zio->io_error)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
|
|
|
|
ASSERT(dde->dde_lead_zio[p] == zio);
|
|
|
|
|
|
|
|
ddt_phys_fill(ddp, zio->io_bp);
|
|
|
|
|
|
|
|
while ((pio = zio_walk_parents(zio)) != NULL)
|
|
|
|
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zio_ddt_child_write_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
int p = zio->io_prop.zp_copies;
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
|
|
|
|
ASSERT(ddp->ddp_refcnt == 0);
|
|
|
|
ASSERT(dde->dde_lead_zio[p] == zio);
|
|
|
|
dde->dde_lead_zio[p] = NULL;
|
|
|
|
|
|
|
|
if (zio->io_error == 0) {
|
|
|
|
while (zio_walk_parents(zio) != NULL)
|
|
|
|
ddt_phys_addref(ddp);
|
|
|
|
} else {
|
|
|
|
ddt_phys_clear(ddp);
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zio_ddt_ditto_write_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
int p = DDT_PHYS_DITTO;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
ddt_key_t *ddk = &dde->dde_key;
|
2010-08-26 20:53:00 +04:00
|
|
|
ASSERTV(zio_prop_t *zp = &zio->io_prop);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
|
|
|
|
ASSERT(ddp->ddp_refcnt == 0);
|
|
|
|
ASSERT(dde->dde_lead_zio[p] == zio);
|
|
|
|
dde->dde_lead_zio[p] = NULL;
|
|
|
|
|
|
|
|
if (zio->io_error == 0) {
|
|
|
|
ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
|
|
|
|
ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
|
|
|
|
ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
|
|
|
|
if (ddp->ddp_phys_birth != 0)
|
|
|
|
ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
|
|
|
|
ddt_phys_fill(ddp, bp);
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_ddt_write(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
uint64_t txg = zio->io_txg;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
int p = zp->zp_copies;
|
|
|
|
int ditto_copies;
|
|
|
|
zio_t *cio = NULL;
|
|
|
|
zio_t *dio = NULL;
|
|
|
|
ddt_t *ddt = ddt_select(spa, bp);
|
|
|
|
ddt_entry_t *dde;
|
|
|
|
ddt_phys_t *ddp;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
|
|
|
|
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
dde = ddt_lookup(ddt, bp, B_TRUE);
|
|
|
|
ddp = &dde->dde_phys[p];
|
|
|
|
|
|
|
|
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
|
|
|
|
/*
|
|
|
|
* If we're using a weak checksum, upgrade to a strong checksum
|
|
|
|
* and try again. If we're already using a strong checksum,
|
|
|
|
* we can't resolve it, so just convert to an ordinary write.
|
|
|
|
* (And automatically e-mail a paper to Nature?)
|
|
|
|
*/
|
|
|
|
if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
|
|
|
|
zp->zp_checksum = spa_dedup_checksum(spa);
|
|
|
|
zio_pop_transforms(zio);
|
|
|
|
zio->io_stage = ZIO_STAGE_OPEN;
|
|
|
|
BP_ZERO(bp);
|
|
|
|
} else {
|
2013-05-10 23:47:54 +04:00
|
|
|
zp->zp_dedup = B_FALSE;
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
|
|
|
ddt_exit(ddt);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
|
|
|
|
ASSERT(ditto_copies < SPA_DVAS_PER_BP);
|
|
|
|
|
|
|
|
if (ditto_copies > ddt_ditto_copies_present(dde) &&
|
|
|
|
dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
|
|
|
|
zio_prop_t czp = *zp;
|
|
|
|
|
|
|
|
czp.zp_copies = ditto_copies;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we arrived here with an override bp, we won't have run
|
|
|
|
* the transform stack, so we won't have the data we need to
|
|
|
|
* generate a child i/o. So, toss the override bp and restart.
|
|
|
|
* This is safe, because using the override bp is just an
|
|
|
|
* optimization; and it's rare, so the cost doesn't matter.
|
|
|
|
*/
|
|
|
|
if (zio->io_bp_override) {
|
|
|
|
zio_pop_transforms(zio);
|
|
|
|
zio->io_stage = ZIO_STAGE_OPEN;
|
|
|
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
|
|
|
zio->io_bp_override = NULL;
|
|
|
|
BP_ZERO(bp);
|
|
|
|
ddt_exit(ddt);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio->io_orig_size, &czp, NULL, NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_ditto_write_done, dde, zio->io_priority,
|
|
|
|
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
|
|
|
|
|
|
|
zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
|
|
|
|
dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
|
|
|
|
if (ddp->ddp_phys_birth != 0)
|
|
|
|
ddt_bp_fill(ddp, bp, txg);
|
|
|
|
if (dde->dde_lead_zio[p] != NULL)
|
|
|
|
zio_add_child(zio, dde->dde_lead_zio[p]);
|
|
|
|
else
|
|
|
|
ddt_phys_addref(ddp);
|
|
|
|
} else if (zio->io_bp_override) {
|
|
|
|
ASSERT(bp->blk_birth == txg);
|
|
|
|
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
|
|
|
|
ddt_phys_fill(ddp, bp);
|
|
|
|
ddt_phys_addref(ddp);
|
|
|
|
} else {
|
|
|
|
cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_child_write_done, dde, zio->io_priority,
|
|
|
|
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
|
|
|
|
|
|
|
zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
|
|
|
|
dde->dde_lead_zio[p] = cio;
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
|
|
|
|
if (cio)
|
|
|
|
zio_nowait(cio);
|
|
|
|
if (dio)
|
|
|
|
zio_nowait(dio);
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Debugging aid: zio_ddt_free() stores here the DDT entry returned by its
 * most recent lookup (the store happens while the DDT is entered).
 */
ddt_entry_t *freedde; /* for debugging */
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static int
|
|
|
|
zio_ddt_free(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
ddt_t *ddt = ddt_select(spa, bp);
|
|
|
|
ddt_entry_t *dde;
|
|
|
|
ddt_phys_t *ddp;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
|
2013-03-19 23:05:08 +04:00
|
|
|
if (dde) {
|
|
|
|
ddp = ddt_phys_select(dde, bp);
|
|
|
|
if (ddp)
|
|
|
|
ddt_phys_decref(ddp);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_exit(ddt);
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Allocate and free blocks
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
|
|
|
zio_dva_allocate(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
2010-05-29 00:45:14 +04:00
|
|
|
metaslab_class_t *mc = spa_normal_class(spa);
|
2008-11-20 23:01:55 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
int error;
|
2011-07-26 23:08:52 +04:00
|
|
|
int flags = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_gang_leader == NULL) {
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
|
|
|
zio->io_gang_leader = zio;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(BP_IS_HOLE(bp));
|
2013-05-11 01:17:03 +04:00
|
|
|
ASSERT0(BP_GET_NDVAS(bp));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT3U(zio->io_prop.zp_copies, >, 0);
|
|
|
|
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
|
|
|
|
|
2011-07-26 23:08:52 +04:00
|
|
|
/*
|
|
|
|
* The dump device does not support gang blocks so allocation on
|
|
|
|
* behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
|
|
|
|
* the "fast" gang feature.
|
|
|
|
*/
|
|
|
|
flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
|
|
|
|
flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
|
|
|
|
METASLAB_GANG_CHILD : 0;
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
|
2008-12-03 23:09:06 +03:00
|
|
|
error = metaslab_alloc(spa, mc, zio->io_size, bp,
|
2011-07-26 23:08:52 +04:00
|
|
|
zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (error) {
|
2011-07-26 23:08:52 +04:00
|
|
|
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
|
|
|
|
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
|
|
|
|
error);
|
2008-12-03 23:09:06 +03:00
|
|
|
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
|
|
|
|
return (zio_write_gang_block(zio));
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_error = error;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_dva_free(zio_t *zio)
|
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_dva_claim(zio_t *zio)
|
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
int error;
|
|
|
|
|
|
|
|
error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
|
|
|
|
if (error)
|
|
|
|
zio->io_error = error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Undo an allocation. This is used by zio_done() when an I/O fails
|
|
|
|
* and we want to give back the block we just allocated.
|
|
|
|
* This handles both normal blocks and gang blocks.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
|
|
|
|
{
|
2010-08-26 20:52:39 +04:00
|
|
|
int g;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_bp_override == NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (!BP_IS_HOLE(bp))
|
2010-05-29 00:45:14 +04:00
|
|
|
metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (gn != NULL) {
|
2010-08-26 20:52:39 +04:00
|
|
|
for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_dva_unallocate(zio, gn->gn_child[g],
|
|
|
|
&gn->gn_gbh->zg_blkptr[g]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to allocate an intent log block. Return 0 on success, errno on failure.
|
|
|
|
*/
|
|
|
|
int
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
|
|
|
|
boolean_t use_slog)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
int error = 1;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(txg > spa_syncing_txg(spa));
|
|
|
|
|
2012-04-20 02:55:28 +04:00
|
|
|
/*
|
|
|
|
* ZIL blocks are always contiguous (i.e. not gang blocks) so we
|
|
|
|
* set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
|
|
|
|
* when allocating them.
|
|
|
|
*/
|
|
|
|
if (use_slog) {
|
2010-05-29 00:45:14 +04:00
|
|
|
error = metaslab_alloc(spa, spa_log_class(spa), size,
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
new_bp, 1, txg, NULL,
|
|
|
|
METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
|
2012-04-20 02:55:28 +04:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2012-04-20 02:55:28 +04:00
|
|
|
if (error) {
|
2010-05-29 00:45:14 +04:00
|
|
|
error = metaslab_alloc(spa, spa_normal_class(spa), size,
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
new_bp, 1, txg, NULL,
|
2013-08-29 22:56:49 +04:00
|
|
|
METASLAB_FASTWRITE);
|
2012-04-20 02:55:28 +04:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
BP_SET_LSIZE(new_bp, size);
|
|
|
|
BP_SET_PSIZE(new_bp, size);
|
|
|
|
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_CHECKSUM(new_bp,
|
|
|
|
spa_version(spa) >= SPA_VERSION_SLIM_ZIL
|
|
|
|
? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
|
|
|
|
BP_SET_LEVEL(new_bp, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_DEDUP(new_bp, 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Free an intent log block.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
|
|
|
void
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(!BP_IS_GANG(bp));
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free(spa, txg, bp);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Read and write to physical devices
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zio_vdev_io_start(zio_t *zio)
|
|
|
|
{
|
|
|
|
vdev_t *vd = zio->io_vd;
|
|
|
|
uint64_t align;
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd == NULL) {
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
|
|
|
|
spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* The mirror_ops handle multiple DVAs in a single BP.
|
|
|
|
*/
|
|
|
|
return (vdev_mirror_ops.vdev_op_io_start(zio));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
|
|
|
|
* We keep track of time-sensitive I/Os so that the scan thread
|
|
|
|
* can quickly react to certain workloads. In particular, we care
|
|
|
|
* about non-scrubbing, top-level reads and writes with the following
|
|
|
|
* characteristics:
|
|
|
|
* - synchronous writes of user data to non-slog devices
|
|
|
|
* - any reads of user data
|
|
|
|
* When these conditions are met, adjust the timestamp of spa_last_io
|
|
|
|
* which allows the scan thread to adjust its workload accordingly.
|
|
|
|
*/
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
|
|
|
|
vd == vd->vdev_top && !vd->vdev_islog &&
|
|
|
|
zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
|
|
|
|
zio->io_txg != spa_syncing_txg(spa)) {
|
|
|
|
uint64_t old = spa->spa_last_io;
|
|
|
|
uint64_t new = ddi_get_lbolt64();
|
|
|
|
if (old != new)
|
|
|
|
(void) atomic_cas_64(&spa->spa_last_io, old, new);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
align = 1ULL << vd->vdev_top->vdev_ashift;
|
|
|
|
|
2014-09-23 03:42:03 +04:00
|
|
|
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
|
|
|
|
P2PHASE(zio->io_size, align) != 0) {
|
|
|
|
/* Transform logical writes to be a full physical block size. */
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t asize = P2ROUNDUP(zio->io_size, align);
|
|
|
|
char *abuf = zio_buf_alloc(asize);
|
2012-10-25 02:22:31 +04:00
|
|
|
ASSERT(vd == vd->vdev_top);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE) {
|
|
|
|
bcopy(zio->io_data, abuf, zio->io_size);
|
|
|
|
bzero(abuf + zio->io_size, asize - zio->io_size);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_push_transform(zio, abuf, asize, asize, zio_subblock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2014-09-23 03:42:03 +04:00
|
|
|
/*
|
|
|
|
* If this is not a physical io, make sure that it is properly aligned
|
|
|
|
* before proceeding.
|
|
|
|
*/
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
|
|
|
|
ASSERT0(P2PHASE(zio->io_offset, align));
|
|
|
|
ASSERT0(P2PHASE(zio->io_size, align));
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* For physical writes, we allow 512b aligned writes and assume
|
|
|
|
* the device will perform a read-modify-write as necessary.
|
|
|
|
*/
|
|
|
|
ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
|
|
|
|
ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
|
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
|
2009-01-16 00:59:39 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is a repair I/O, and there's no self-healing involved --
|
|
|
|
* that is, we're just resilvering what we expect to resilver --
|
|
|
|
* then don't do the I/O unless zio's txg is actually in vd's DTL.
|
|
|
|
* This prevents spurious resilvering with nested replication.
|
|
|
|
* For example, given a mirror of mirrors, (A+B)+(C+D), if only
|
|
|
|
* A is out of date, we'll read from C+D, then use the data to
|
|
|
|
* resilver A+B -- but we don't actually want to resilver B, just A.
|
|
|
|
* The top-level mirror has no way to know this, so instead we just
|
|
|
|
* discard unnecessary repairs as we work our way down the vdev tree.
|
|
|
|
* The same logic applies to any form of nested replication:
|
|
|
|
* ditto + mirror, RAID-Z + replacing, etc. This covers them all.
|
|
|
|
*/
|
|
|
|
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
|
|
|
|
zio->io_txg != 0 && /* not a delegated i/o */
|
|
|
|
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
zio_vdev_io_bypass(zio);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
|
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
|
2009-02-18 23:51:31 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if ((zio = vdev_queue_io(zio)) == NULL)
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
|
|
|
|
|
|
|
if (!vdev_accessible(vd, zio)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_interrupt(zio);
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (vd->vdev_ops->vdev_op_io_start(zio));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_vdev_io_done(zio_t *zio)
|
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
vdev_t *vd = zio->io_vd;
|
|
|
|
vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
|
|
|
|
boolean_t unexpected_error = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
|
|
|
|
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
|
|
|
|
|
|
|
|
vdev_queue_io_done(zio);
|
|
|
|
|
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE)
|
|
|
|
vdev_cache_write(zio);
|
|
|
|
|
|
|
|
if (zio_injection_enabled && zio->io_error == 0)
|
2009-07-03 02:44:48 +04:00
|
|
|
zio->io_error = zio_handle_device_injection(vd,
|
|
|
|
zio, EIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (zio_injection_enabled && zio->io_error == 0)
|
|
|
|
zio->io_error = zio_handle_label_injection(zio, EIO);
|
|
|
|
|
|
|
|
if (zio->io_error) {
|
|
|
|
if (!vdev_accessible(vd, zio)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
unexpected_error = B_TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ops->vdev_op_io_done(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (unexpected_error)
|
2009-02-18 23:51:31 +03:00
|
|
|
VERIFY(vdev_probe(vd, zio) == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* For non-raidz ZIOs, we can just copy aside the bad data read from the
|
|
|
|
* disk, and use that to finish the checksum ereport later.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
|
|
|
|
const void *good_buf)
|
|
|
|
{
|
|
|
|
/* no processing needed */
|
|
|
|
zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
void
|
|
|
|
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
|
|
|
|
{
|
|
|
|
void *buf = zio_buf_alloc(zio->io_size);
|
|
|
|
|
|
|
|
bcopy(zio->io_data, buf, zio->io_size);
|
|
|
|
|
|
|
|
zcr->zcr_cbinfo = zio->io_size;
|
|
|
|
zcr->zcr_cbdata = buf;
|
|
|
|
zcr->zcr_finish = zio_vsd_default_cksum_finish;
|
|
|
|
zcr->zcr_free = zio_buf_free;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
|
|
|
zio_vdev_io_assess(zio_t *zio)
|
|
|
|
{
|
|
|
|
vdev_t *vd = zio->io_vd;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
|
|
|
|
|
|
|
if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
|
|
|
|
spa_config_exit(zio->io_spa, SCL_ZIO, zio);
|
|
|
|
|
|
|
|
if (zio->io_vsd != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_vsd_ops->vsd_free(zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_vsd = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio_injection_enabled && zio->io_error == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_error = zio_handle_fault_injection(zio, EIO);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the I/O failed, determine whether we should attempt to retry it.
|
2010-05-29 00:45:14 +04:00
|
|
|
*
|
|
|
|
* On retry, we cut in line in the issue queue, since we don't want
|
|
|
|
* compression/checksumming/etc. work to prevent our (cheap) IO reissue.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error && vd == NULL &&
|
|
|
|
!(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_error = 0;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_flags |= ZIO_FLAG_IO_RETRY |
|
|
|
|
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
|
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
|
|
|
|
zio_requeue_io_start_cut_in_line);
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* If we got an error on a leaf device, convert it to ENXIO
|
|
|
|
* if the device is not accessible at all.
|
|
|
|
*/
|
|
|
|
if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
!vdev_accessible(vd, zio))
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we can't write to an interior vdev (mirror or RAID-Z),
|
|
|
|
* set vdev_cant_write so that we stop trying to allocate from it.
|
|
|
|
*/
|
|
|
|
if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
|
2013-09-04 16:00:57 +04:00
|
|
|
vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
|
2008-12-03 23:09:06 +03:00
|
|
|
vd->vdev_cant_write = B_TRUE;
|
2013-09-04 16:00:57 +04:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (zio->io_error)
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
zio->io_physdone != NULL) {
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
|
|
|
|
zio->io_physdone(zio->io_logical);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_vdev_io_reissue(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
|
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage >>= 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_vdev_io_redone(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage >>= 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_vdev_io_bypass(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
|
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
|
|
|
|
zio->io_flags |= ZIO_FLAG_IO_BYPASS;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Generate and verify checksums
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zio_checksum_generate(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-12-03 23:09:06 +03:00
|
|
|
enum zio_checksum checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (bp == NULL) {
|
|
|
|
/*
|
|
|
|
* This is zio_write_phys().
|
|
|
|
* We're either generating a label checksum, or none at all.
|
|
|
|
*/
|
|
|
|
checksum = zio->io_prop.zp_checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (checksum == ZIO_CHECKSUM_OFF)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
|
|
|
ASSERT(checksum == ZIO_CHECKSUM_LABEL);
|
|
|
|
} else {
|
|
|
|
if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
|
|
|
|
ASSERT(!IO_IS_ALLOCATING(zio));
|
|
|
|
checksum = ZIO_CHECKSUM_GANG_HEADER;
|
|
|
|
} else {
|
|
|
|
checksum = BP_GET_CHECKSUM(bp);
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_checksum_verify(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_bad_cksum_t info;
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
int error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_vd != NULL);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (bp == NULL) {
|
|
|
|
/*
|
|
|
|
* This is zio_read_phys().
|
|
|
|
* We're either verifying a label checksum, or nothing at all.
|
|
|
|
*/
|
|
|
|
if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((error = zio_checksum_error(zio, &info)) != 0) {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_error = error;
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_ereport_start_checksum(zio->io_spa,
|
|
|
|
zio->io_vd, zio, zio->io_offset,
|
|
|
|
zio->io_size, NULL, &info);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called by RAID-Z to ensure we don't compute the checksum twice.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zio_checksum_verified(zio_t *zio)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
* Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
|
2014-06-06 01:19:08 +04:00
|
|
|
* An error of 0 indicates success. ENXIO indicates whole-device failure,
|
2008-12-03 23:09:06 +03:00
|
|
|
* which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
|
|
|
|
* indicate errors that are specific to one I/O, and most likely permanent.
|
|
|
|
* Any other error is presumed to be worse because we weren't expecting it.
|
|
|
|
* ==========================================================================
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
int
|
|
|
|
zio_worst_error(int e1, int e2)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
|
|
|
|
int r1, r2;
|
|
|
|
|
|
|
|
for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
|
|
|
|
if (e1 == zio_error_rank[r1])
|
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
|
|
|
|
if (e2 == zio_error_rank[r2])
|
|
|
|
break;
|
|
|
|
|
|
|
|
return (r1 > r2 ? e1 : e2);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* I/O completion
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
static int
|
|
|
|
zio_ready(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *pio, *pio_next;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
|
|
|
|
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
|
2009-07-03 02:44:48 +04:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_ready) {
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
2013-05-10 23:47:54 +04:00
|
|
|
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
|
|
|
|
(zio->io_flags & ZIO_FLAG_NOPWRITE));
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_ready(zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (bp != NULL && bp != &zio->io_bp_copy)
|
|
|
|
zio->io_bp_copy = *bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error)
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_state[ZIO_WAIT_READY] = 1;
|
|
|
|
pio = zio_walk_parents(zio);
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* As we notify zio's parents, new parents could be added.
|
|
|
|
* New parents go to the head of zio's io_parent_list, however,
|
|
|
|
* so we will (correctly) not notify them. The remainder of zio's
|
|
|
|
* io_parent_list, from 'pio_next' onward, cannot change because
|
|
|
|
* all parents must wait for us to be done before they can be done.
|
|
|
|
*/
|
|
|
|
for (; pio != NULL; pio = pio_next) {
|
|
|
|
pio_next = zio_walk_parents(zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
|
2009-02-18 23:51:31 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zio->io_flags & ZIO_FLAG_NODATA) {
|
|
|
|
if (BP_IS_GANG(bp)) {
|
|
|
|
zio->io_flags &= ~ZIO_FLAG_NODATA;
|
|
|
|
} else {
|
|
|
|
ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
|
|
|
|
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zio_injection_enabled &&
|
|
|
|
zio->io_spa->spa_syncing_txg == zio->io_txg)
|
|
|
|
zio_handle_ignored_writes(zio);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static int
|
|
|
|
zio_done(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *pio, *pio_next;
|
2010-08-26 20:52:39 +04:00
|
|
|
int c, w;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
2009-07-03 02:44:48 +04:00
|
|
|
* If our children haven't all completed,
|
2008-12-03 23:09:06 +03:00
|
|
|
* wait for them and then repeat this pipeline stage.
|
|
|
|
*/
|
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
|
|
|
|
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (c = 0; c < ZIO_CHILD_TYPES; c++)
|
|
|
|
for (w = 0; w < ZIO_WAIT_TYPES; w++)
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_children[c][w] == 0);
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(zio->io_bp->blk_pad[0] == 0);
|
|
|
|
ASSERT(zio->io_bp->blk_pad[1] == 0);
|
2013-11-01 23:26:11 +04:00
|
|
|
ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
|
|
|
|
sizeof (blkptr_t)) == 0 ||
|
2010-08-26 22:04:17 +04:00
|
|
|
(zio->io_bp == zio_unique_parent(zio)->io_bp));
|
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_bp_override == NULL &&
|
2008-12-03 23:09:06 +03:00
|
|
|
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
|
2013-11-01 23:26:11 +04:00
|
|
|
ASSERT3U(zio->io_prop.zp_copies, <=,
|
|
|
|
BP_GET_NDVAS(zio->io_bp));
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
|
2013-11-01 23:26:11 +04:00
|
|
|
(BP_COUNT_GANG(zio->io_bp) ==
|
|
|
|
BP_GET_NDVAS(zio->io_bp)));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2013-05-10 23:47:54 +04:00
|
|
|
if (zio->io_flags & ZIO_FLAG_NOPWRITE)
|
|
|
|
VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* If there were child vdev/gang/ddt errors, they apply to us now.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
|
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the I/O on the transformed data was successful, generate any
|
|
|
|
* checksum reports now while we still have the transformed data.
|
|
|
|
*/
|
|
|
|
if (zio->io_error == 0) {
|
|
|
|
while (zio->io_cksum_report != NULL) {
|
|
|
|
zio_cksum_report_t *zcr = zio->io_cksum_report;
|
|
|
|
uint64_t align = zcr->zcr_align;
|
2010-08-26 22:04:17 +04:00
|
|
|
uint64_t asize = P2ROUNDUP(zio->io_size, align);
|
2010-05-29 00:45:14 +04:00
|
|
|
char *abuf = zio->io_data;
|
|
|
|
|
2010-08-26 22:04:17 +04:00
|
|
|
if (asize != zio->io_size) {
|
2010-05-29 00:45:14 +04:00
|
|
|
abuf = zio_buf_alloc(asize);
|
2010-08-26 22:04:17 +04:00
|
|
|
bcopy(zio->io_data, abuf, zio->io_size);
|
2013-11-01 23:26:11 +04:00
|
|
|
bzero(abuf+zio->io_size, asize-zio->io_size);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
zio->io_cksum_report = zcr->zcr_next;
|
|
|
|
zcr->zcr_next = NULL;
|
|
|
|
zcr->zcr_finish(zcr, abuf);
|
|
|
|
zfs_ereport_free_checksum(zcr);
|
|
|
|
|
2010-08-26 22:04:17 +04:00
|
|
|
if (asize != zio->io_size)
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_buf_free(abuf, asize);
|
|
|
|
}
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
zio_pop_transforms(zio); /* note: may set zio->io_error */
|
|
|
|
|
2010-08-26 22:04:17 +04:00
|
|
|
vdev_stat_update(zio, zio->io_size);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-10-02 03:54:52 +04:00
|
|
|
/*
|
2013-04-30 02:49:23 +04:00
|
|
|
* If this I/O is attached to a particular vdev and is slow, exceeding
|
2012-12-21 06:15:34 +04:00
|
|
|
* 30 seconds to complete, post an error describing the I/O delay.
|
|
|
|
* We ignore these errors if the device is currently unavailable.
|
2010-10-02 03:54:52 +04:00
|
|
|
*/
|
2013-04-30 02:49:23 +04:00
|
|
|
if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
|
2012-12-21 06:15:34 +04:00
|
|
|
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
|
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
|
2013-11-01 23:26:11 +04:00
|
|
|
zio->io_vd, zio, 0, 0);
|
2012-12-21 06:15:34 +04:00
|
|
|
}
|
2010-10-02 03:54:52 +04:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error) {
|
|
|
|
/*
|
|
|
|
* If this I/O is attached to a particular vdev,
|
|
|
|
* generate an error message describing the I/O failure
|
|
|
|
* at the block level. We ignore these errors if the
|
|
|
|
* device is currently unavailable.
|
|
|
|
*/
|
2010-08-26 22:04:17 +04:00
|
|
|
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
|
|
|
|
!vdev_is_dead(zio->io_vd))
|
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
|
|
|
|
zio->io_vd, zio, 0, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zio->io_error == EIO || !(zio->io_flags &
|
|
|
|
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
|
2010-08-26 22:04:17 +04:00
|
|
|
zio == zio->io_logical) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* For logical I/O requests, tell the SPA to log the
|
|
|
|
* error and generate a logical data ereport.
|
|
|
|
*/
|
2010-08-26 22:04:17 +04:00
|
|
|
spa_log_error(zio->io_spa, zio);
|
2013-11-01 23:26:11 +04:00
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
|
|
|
|
NULL, zio, 0, 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:04:17 +04:00
|
|
|
if (zio->io_error && zio == zio->io_logical) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Determine whether zio should be reexecuted. This will
|
|
|
|
* propagate all the way to the root via zio_notify_parent().
|
|
|
|
*/
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (IO_IS_ALLOCATING(zio) &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error != ENOSPC)
|
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
|
|
|
|
else
|
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if ((zio->io_type == ZIO_TYPE_READ ||
|
|
|
|
zio->io_type == ZIO_TYPE_FREE) &&
|
2010-08-27 01:24:34 +04:00
|
|
|
!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_error == ENXIO &&
|
2010-08-26 22:04:17 +04:00
|
|
|
spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
|
|
|
|
spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
|
|
|
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
|
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Here is a possibly good place to attempt to do
|
|
|
|
* either combinatorial reconstruction or error correction
|
|
|
|
* based on checksums. It also might be a good place
|
|
|
|
* to send out preliminary ereports before we suspend
|
|
|
|
* processing.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* If there were logical child errors, they apply to us now.
|
|
|
|
* We defer this until now to avoid conflating logical child
|
|
|
|
* errors with errors that happened to the zio itself when
|
|
|
|
* updating vdev stats and reporting FMA events above.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zio->io_error || zio->io_reexecute) &&
|
|
|
|
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
|
2013-05-10 23:47:54 +04:00
|
|
|
!(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
|
2010-08-26 22:04:17 +04:00
|
|
|
zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
zio_gang_tree_free(&zio->io_gang_tree);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Godfather I/Os should never suspend.
|
|
|
|
*/
|
|
|
|
if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
|
|
|
|
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
|
|
|
|
zio->io_reexecute = 0;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_reexecute) {
|
|
|
|
/*
|
|
|
|
* This is a logical I/O that wants to reexecute.
|
|
|
|
*
|
|
|
|
* Reexecute is top-down. When an i/o fails, if it's not
|
|
|
|
* the root, it simply notifies its parent and sticks around.
|
|
|
|
* The parent, seeing that it still has children in zio_done(),
|
|
|
|
* does the same. This percolates all the way up to the root.
|
|
|
|
* The root i/o will reexecute or suspend the entire tree.
|
|
|
|
*
|
|
|
|
* This approach ensures that zio_reexecute() honors
|
|
|
|
* all the original i/o dependency relationships, e.g.
|
|
|
|
* parents not executing until children are ready.
|
|
|
|
*/
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zio->io_gang_leader = NULL;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_state[ZIO_WAIT_DONE] = 1;
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
|
|
|
* "The Godfather" I/O monitors its children but is
|
|
|
|
* not a true parent to them. It will track them through
|
|
|
|
* the pipeline but severs its ties whenever they get into
|
|
|
|
* trouble (e.g. suspended). This allows "The Godfather"
|
|
|
|
* I/O to return status without blocking.
|
|
|
|
*/
|
|
|
|
for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
|
|
|
|
zio_link_t *zl = zio->io_walk_link;
|
|
|
|
pio_next = zio_walk_parents(zio);
|
|
|
|
|
|
|
|
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
|
|
|
|
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
|
|
|
|
zio_remove_child(pio, zio, zl);
|
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
if ((pio = zio_unique_parent(zio)) != NULL) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* We're not a root i/o, so there's nothing to do
|
|
|
|
* but notify our parent. Don't propagate errors
|
|
|
|
* upward since we haven't permanently failed yet.
|
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
|
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
|
|
|
|
} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
|
|
|
|
/*
|
|
|
|
* We'd fail again if we reexecuted now, so suspend
|
|
|
|
* until conditions improve (e.g. device comes online).
|
|
|
|
*/
|
2010-08-26 22:04:17 +04:00
|
|
|
zio_suspend(zio->io_spa, zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Reexecution is potentially a huge amount of work.
|
|
|
|
* Hand it off to the otherwise-unused claim taskq.
|
|
|
|
*/
|
2011-11-08 04:26:52 +04:00
|
|
|
ASSERT(taskq_empty_ent(&zio->io_tqent));
|
2013-05-06 23:24:30 +04:00
|
|
|
spa_taskq_dispatch_ent(zio->io_spa,
|
|
|
|
ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
|
2011-11-08 04:26:52 +04:00
|
|
|
(task_func_t *)zio_reexecute, zio, 0,
|
|
|
|
&zio->io_tqent);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_count == 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_reexecute == 0);
|
|
|
|
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Report any checksum errors, since the I/O is complete.
|
|
|
|
*/
|
|
|
|
while (zio->io_cksum_report != NULL) {
|
|
|
|
zio_cksum_report_t *zcr = zio->io_cksum_report;
|
|
|
|
zio->io_cksum_report = zcr->zcr_next;
|
|
|
|
zcr->zcr_next = NULL;
|
|
|
|
zcr->zcr_finish(zcr, NULL);
|
|
|
|
zfs_ereport_free_checksum(zcr);
|
|
|
|
}
|
|
|
|
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks across vdevs is important for performance: indeed, using
multiple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implicitly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
lingering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
|
2014-06-06 01:19:08 +04:00
|
|
|
!BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks across vdevs is important for performance: indeed, using
multiple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implicitly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
lingering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
/*
|
|
|
|
* It is the responsibility of the done callback to ensure that this
|
|
|
|
* particular zio is no longer discoverable for adoption, and as
|
|
|
|
* such, cannot acquire any new parents.
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_done)
|
|
|
|
zio->io_done(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_state[ZIO_WAIT_DONE] = 1;
|
|
|
|
mutex_exit(&zio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
|
|
|
|
zio_link_t *zl = zio->io_walk_link;
|
|
|
|
pio_next = zio_walk_parents(zio);
|
|
|
|
zio_remove_child(pio, zio, zl);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_waiter != NULL) {
|
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_executor = NULL;
|
|
|
|
cv_broadcast(&zio->io_cv);
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
} else {
|
|
|
|
zio_destroy(zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
* I/O pipeline definition
|
|
|
|
* ==========================================================================
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static zio_pipe_stage_t *zio_pipeline[] = {
|
2008-12-03 23:09:06 +03:00
|
|
|
NULL,
|
|
|
|
zio_read_bp_init,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free_bp_init,
|
|
|
|
zio_issue_async,
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write_bp_init,
|
|
|
|
zio_checksum_generate,
|
2013-05-10 23:47:54 +04:00
|
|
|
zio_nop_write,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_read_start,
|
|
|
|
zio_ddt_read_done,
|
|
|
|
zio_ddt_write,
|
|
|
|
zio_ddt_free,
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_assemble,
|
|
|
|
zio_gang_issue,
|
|
|
|
zio_dva_allocate,
|
|
|
|
zio_dva_free,
|
|
|
|
zio_dva_claim,
|
|
|
|
zio_ready,
|
|
|
|
zio_vdev_io_start,
|
|
|
|
zio_vdev_io_done,
|
|
|
|
zio_vdev_io_assess,
|
|
|
|
zio_checksum_verify,
|
|
|
|
zio_done
|
|
|
|
};
|
2010-08-26 22:49:16 +04:00
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
/* dnp is the dnode for zb1->zb_object */
|
|
|
|
boolean_t
|
2014-06-25 22:37:59 +04:00
|
|
|
zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
|
|
|
|
const zbookmark_phys_t *zb2)
|
2012-12-14 03:24:15 +04:00
|
|
|
{
|
|
|
|
uint64_t zb1nextL0, zb2thisobj;
|
|
|
|
|
|
|
|
ASSERT(zb1->zb_objset == zb2->zb_objset);
|
|
|
|
ASSERT(zb2->zb_level == 0);
|
|
|
|
|
|
|
|
/* The objset_phys_t isn't before anything. */
|
|
|
|
if (dnp == NULL)
|
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
zb1nextL0 = (zb1->zb_blkid + 1) <<
|
|
|
|
((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
|
|
|
|
|
|
|
|
zb2thisobj = zb2->zb_object ? zb2->zb_object :
|
|
|
|
zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
|
|
|
|
|
|
|
|
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
|
|
|
|
uint64_t nextobj = zb1nextL0 *
|
|
|
|
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
|
|
|
|
return (nextobj <= zb2thisobj);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zb1->zb_object < zb2thisobj)
|
|
|
|
return (B_TRUE);
|
|
|
|
if (zb1->zb_object > zb2thisobj)
|
|
|
|
return (B_FALSE);
|
|
|
|
if (zb2->zb_object == DMU_META_DNODE_OBJECT)
|
|
|
|
return (B_FALSE);
|
|
|
|
return (zb1nextL0 <= zb2->zb_blkid);
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:49:16 +04:00
|
|
|
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Exported symbols: type-name lookup plus the buffer alloc/free pairs. */
EXPORT_SYMBOL(zio_type_name);
EXPORT_SYMBOL(zio_buf_alloc);
EXPORT_SYMBOL(zio_buf_free);
EXPORT_SYMBOL(zio_data_buf_alloc);
EXPORT_SYMBOL(zio_data_buf_free);

/*
 * Module parameters.  Each is registered as an int with mode 0644 and is
 * immediately followed by its description string.
 */
module_param(zio_delay_max, int, 0644);
MODULE_PARM_DESC(zio_delay_max,
	"Max zio millisec delay before posting event");

module_param(zio_requeue_io_start_cut_in_line, int, 0644);
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line,
	"Prioritize requeued I/O");

module_param(zfs_sync_pass_deferred_free, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
	"Defer frees starting in this pass");

module_param(zfs_sync_pass_dont_compress, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
	"Don't compress starting in this pass");

module_param(zfs_sync_pass_rewrite, int, 0644);
MODULE_PARM_DESC(zfs_sync_pass_rewrite,
	"Rewrite new bps starting in this pass");
#endif /* _KERNEL && HAVE_SPL */
|