2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2020-09-04 20:34:28 +03:00
|
|
|
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
|
2011-11-08 04:26:52 +04:00
|
|
|
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
|
2018-09-06 04:33:36 +03:00
|
|
|
* Copyright (c) 2017, Intel Corporation.
|
Add zstd support to zfs
This PR adds two new compression types, based on ZStandard:
- zstd: A basic ZStandard compression algorithm Available compression.
Levels for zstd are zstd-1 through zstd-19, where the compression
increases with every level, but speed decreases.
- zstd-fast: A faster version of the ZStandard compression algorithm
zstd-fast is basically a "negative" level of zstd. The compression
decreases with every level, but speed increases.
Available compression levels for zstd-fast:
- zstd-fast-1 through zstd-fast-10
- zstd-fast-20 through zstd-fast-100 (in increments of 10)
- zstd-fast-500 and zstd-fast-1000
For more information check the man page.
Implementation details:
Rather than treat each level of zstd as a different algorithm (as was
done historically with gzip), the block pointer `enum zio_compress`
value is simply zstd for all levels, including zstd-fast, since they all
use the same decompression function.
The compress= property (a 64bit unsigned integer) uses the lower 7 bits
to store the compression algorithm (matching the number of bits used in
a block pointer, as the 8th bit was borrowed for embedded block
pointers). The upper bits are used to store the compression level.
It is necessary to be able to determine what compression level was used
when later reading a block back, so the concept used in LZ4, where the
first 32bits of the on-disk value are the size of the compressed data
(since the allocation is rounded up to the nearest ashift), was
extended, and we store the version of ZSTD and the level as well as the
compressed size. This value is returned when decompressing a block, so
that if the block needs to be recompressed (L2ARC, nop-write, etc), that
the same parameters will be used to result in the matching checksum.
All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`,
`zio_prop_t`, etc.) uses the separated _compress and _complevel
variables. Only the properties ZAP contains the combined/bit-shifted
value. The combined value is split when the compression_changed_cb()
callback is called, and sets both objset members (os_compress and
os_complevel).
The userspace tools all use the combined/bit-shifted value.
Additional notes:
zdb can now also decode the ZSTD compression header (flag -Z) and
inspect the size, version and compression level saved in that header.
For each record, if it is ZSTD compressed, the parameters of the decoded
compression header get printed.
ZSTD is included with all current tests and new tests are added
as-needed.
Per-dataset feature flags now get activated when the property is set.
If a compression algorithm requires a feature flag, zfs activates the
feature when the property is set, rather than waiting for the first
block to be born. This is currently only used by zstd but can be
extended as needed.
Portions-Sponsored-By: The FreeBSD Foundation
Co-authored-by: Allan Jude <allanjude@freebsd.org>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Co-authored-by: Michael Niewöhner <foss@mniewoehner.de>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #6247
Closes #9024
Closes #10277
Closes #10278
2020-08-18 20:10:17 +03:00
|
|
|
* Copyright (c) 2019, Klara Inc.
|
|
|
|
* Copyright (c) 2019, Allan Jude
|
2021-04-16 21:00:53 +03:00
|
|
|
* Copyright (c) 2021, Datto, Inc.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
2014-11-03 23:15:08 +03:00
|
|
|
#include <sys/sysmacros.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
#include <sys/fm/fs/zfs.h>
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/txg.h>
|
|
|
|
#include <sys/spa_impl.h>
|
|
|
|
#include <sys/vdev_impl.h>
|
2019-03-29 19:13:20 +03:00
|
|
|
#include <sys/vdev_trim.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/zio_impl.h>
|
|
|
|
#include <sys/zio_compress.h>
|
|
|
|
#include <sys/zio_checksum.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/dmu_objset.h>
|
|
|
|
#include <sys/arc.h>
|
|
|
|
#include <sys/ddt.h>
|
2014-06-06 01:19:08 +04:00
|
|
|
#include <sys/blkptr.h>
|
2013-12-09 22:37:51 +04:00
|
|
|
#include <sys/zfeature.h>
|
2017-11-16 04:27:01 +03:00
|
|
|
#include <sys/dsl_scan.h>
|
2016-10-14 03:59:18 +03:00
|
|
|
#include <sys/metaslab_impl.h>
|
2016-02-29 21:05:23 +03:00
|
|
|
#include <sys/time.h>
|
Enable use of DTRACE_PROBE* macros in "spl" module
This change modifies some of the infrastructure for enabling the use of
the DTRACE_PROBE* macros, such that we can use tehm in the "spl" module.
Currently, when the DTRACE_PROBE* macros are used, they get expanded to
create new functions, and these dynamically generated functions become
part of the "zfs" module.
Since the "spl" module does not depend on the "zfs" module, the use of
DTRACE_PROBE* in the "spl" module would result in undefined symbols
being used in the "spl" module. Specifically, DTRACE_PROBE* would turn
into a function call, and the function being called would be a symbol
only contained in the "zfs" module; which results in a linker and/or
runtime error.
Thus, this change adds the necessary logic to the "spl" module, to
mirror the tracing functionality available to the "zfs" module. After
this change, we'll have a "trace_zfs.h" header file which defines the
probes available only to the "zfs" module, and a "trace_spl.h" header
file which defines the probes available only to the "spl" module.
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Prakash Surya <prakash.surya@delphix.com>
Closes #9525
2019-10-30 21:02:41 +03:00
|
|
|
#include <sys/trace_zfs.h>
|
2016-07-22 18:52:49 +03:00
|
|
|
#include <sys/abd.h>
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
#include <sys/dsl_crypt.h>
|
2020-03-27 19:11:22 +03:00
|
|
|
#include <cityhash.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O type descriptions
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
const char *zio_type_name[ZIO_TYPES] = {
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* Note: Linux kernel thread name length is limited
|
|
|
|
* so these names will differ from upstream open zfs.
|
|
|
|
*/
|
2019-03-29 19:13:20 +03:00
|
|
|
"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
|
2010-05-29 00:45:14 +04:00
|
|
|
};
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-12-08 23:57:42 +03:00
|
|
|
int zio_dva_throttle_enabled = B_TRUE;
|
2019-02-15 23:44:24 +03:00
|
|
|
int zio_deadman_log_all = B_FALSE;
|
2016-10-14 03:59:18 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O kmem caches
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
kmem_cache_t *zio_cache;
|
2009-02-18 23:51:31 +03:00
|
|
|
kmem_cache_t *zio_link_cache;
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
|
|
|
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
2016-07-22 18:52:49 +03:00
|
|
|
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
|
|
|
uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
|
|
|
uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
|
|
|
#endif
|
|
|
|
|
2018-11-09 03:47:24 +03:00
|
|
|
/* Mark IOs as "slow" if they take longer than 30 seconds */
|
|
|
|
int zio_slow_io_ms = (30 * MILLISEC);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
#define BP_SPANB(indblkshift, level) \
|
|
|
|
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
|
|
|
|
#define COMPARE_META_LEVEL 0x80000000ul
|
2013-05-06 21:14:52 +04:00
|
|
|
/*
|
|
|
|
* The following actions directly effect the spa's sync-to-convergence logic.
|
|
|
|
* The values below define the sync pass when we start performing the action.
|
|
|
|
* Care should be taken when changing these values as they directly impact
|
|
|
|
* spa_sync() performance. Tuning these values may introduce subtle performance
|
|
|
|
* pathologies and should only be done in the context of performance analysis.
|
|
|
|
* These tunables will eventually be removed and replaced with #defines once
|
|
|
|
* enough analysis has been done to determine optimal values.
|
|
|
|
*
|
|
|
|
* The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
|
|
|
|
* regular blocks are not deferred.
|
2019-06-13 23:10:19 +03:00
|
|
|
*
|
|
|
|
* Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable
|
|
|
|
* compression (including of metadata). In practice, we don't have this
|
|
|
|
* many sync passes, so this has no effect.
|
|
|
|
*
|
|
|
|
* The original intent was that disabling compression would help the sync
|
|
|
|
* passes to converge. However, in practice disabling compression increases
|
|
|
|
* the average number of sync passes, because when we turn compression off, a
|
|
|
|
* lot of block's size will change and thus we have to re-allocate (not
|
|
|
|
* overwrite) them. It also increases the number of 128KB allocations (e.g.
|
|
|
|
* for indirect blocks and spacemaps) because these will not be compressed.
|
|
|
|
* The 128K allocations are especially detrimental to performance on highly
|
|
|
|
* fragmented systems, which may have very few free segments of this size,
|
|
|
|
* and may need to load new metaslabs to satisfy 128K allocations.
|
2013-05-06 21:14:52 +04:00
|
|
|
*/
|
|
|
|
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
|
2019-06-13 23:10:19 +03:00
|
|
|
int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */
|
2013-05-06 21:14:52 +04:00
|
|
|
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* An allocating zio is one that either currently has the DVA allocate
|
|
|
|
* stage set or will have it later in its lifetime.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
|
|
|
|
|
2019-12-09 23:29:56 +03:00
|
|
|
/*
|
|
|
|
* Enable smaller cores by excluding metadata
|
|
|
|
* allocations as well.
|
|
|
|
*/
|
|
|
|
int zio_exclude_metadata = 0;
|
2011-05-04 02:09:28 +04:00
|
|
|
int zio_requeue_io_start_cut_in_line = 1;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
int zio_buf_debug_limit = 16384;
|
|
|
|
#else
|
|
|
|
int zio_buf_debug_limit = 0;
|
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
static inline void __zio_execute(zio_t *zio);
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
void
|
|
|
|
zio_init(void)
|
|
|
|
{
|
|
|
|
size_t c;
|
|
|
|
|
2015-01-30 22:25:19 +03:00
|
|
|
zio_cache = kmem_cache_create("zio_cache",
|
|
|
|
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_link_cache = kmem_cache_create("zio_link_cache",
|
2014-05-15 05:17:39 +04:00
|
|
|
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For small buffers, we want a cache for each multiple of
|
2014-11-03 23:15:08 +03:00
|
|
|
* SPA_MINBLOCKSIZE. For larger buffers, we want a cache
|
|
|
|
* for each quarter-power of 2.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
|
|
|
|
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
|
|
|
|
size_t p2 = size;
|
|
|
|
size_t align = 0;
|
2019-12-09 23:29:56 +03:00
|
|
|
size_t data_cflags, cflags;
|
|
|
|
|
|
|
|
data_cflags = KMC_NODEBUG;
|
|
|
|
cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
|
|
|
|
KMC_NODEBUG : 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-11-05 07:54:48 +03:00
|
|
|
#if defined(_ILP32) && defined(_KERNEL)
|
2014-11-03 23:15:08 +03:00
|
|
|
/*
|
|
|
|
* Cache size limited to 1M on 32-bit platforms until ARC
|
|
|
|
* buffers no longer require virtual address space.
|
|
|
|
*/
|
|
|
|
if (size > zfs_max_recordsize)
|
|
|
|
break;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
while (!ISP2(p2))
|
2008-11-20 23:01:55 +03:00
|
|
|
p2 &= p2 - 1;
|
|
|
|
|
2013-05-17 01:18:06 +04:00
|
|
|
#ifndef _KERNEL
|
|
|
|
/*
|
|
|
|
* If we are using watchpoints, put each buffer on its own page,
|
|
|
|
* to eliminate the performance overhead of trapping to the
|
|
|
|
* kernel when modifying a non-watched buffer that shares the
|
|
|
|
* page with a watched buffer.
|
|
|
|
*/
|
|
|
|
if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
|
|
|
|
continue;
|
2016-06-29 23:59:51 +03:00
|
|
|
/*
|
|
|
|
* Here's the problem - on 4K native devices in userland on
|
|
|
|
* Linux using O_DIRECT, buffers must be 4K aligned or I/O
|
|
|
|
* will fail with EINVAL, causing zdb (and others) to coredump.
|
|
|
|
* Since userland probably doesn't need optimized buffer caches,
|
|
|
|
* we just force 4K alignment on everything.
|
|
|
|
*/
|
|
|
|
align = 8 * SPA_MINBLOCKSIZE;
|
|
|
|
#else
|
2017-05-02 20:04:30 +03:00
|
|
|
if (size < PAGESIZE) {
|
2008-11-20 23:01:55 +03:00
|
|
|
align = SPA_MINBLOCKSIZE;
|
2013-05-17 01:18:06 +04:00
|
|
|
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
|
2017-05-02 20:04:30 +03:00
|
|
|
align = PAGESIZE;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2016-06-29 23:59:51 +03:00
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (align != 0) {
|
|
|
|
char name[36];
|
2021-04-17 22:36:04 +03:00
|
|
|
if (cflags == data_cflags) {
|
|
|
|
/*
|
|
|
|
* Resulting kmem caches would be identical.
|
|
|
|
* Save memory by creating only one.
|
|
|
|
*/
|
|
|
|
(void) snprintf(name, sizeof (name),
|
|
|
|
"zio_buf_comb_%lu", (ulong_t)size);
|
|
|
|
zio_buf_cache[c] = kmem_cache_create(name,
|
|
|
|
size, align, NULL, NULL, NULL, NULL, NULL,
|
|
|
|
cflags);
|
|
|
|
zio_data_buf_cache[c] = zio_buf_cache[c];
|
|
|
|
continue;
|
|
|
|
}
|
2020-06-07 21:42:12 +03:00
|
|
|
(void) snprintf(name, sizeof (name), "zio_buf_%lu",
|
|
|
|
(ulong_t)size);
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_buf_cache[c] = kmem_cache_create(name, size,
|
2015-02-07 00:37:02 +03:00
|
|
|
align, NULL, NULL, NULL, NULL, NULL, cflags);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2020-06-07 21:42:12 +03:00
|
|
|
(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
|
|
|
|
(ulong_t)size);
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_data_buf_cache[c] = kmem_cache_create(name, size,
|
2020-11-12 04:11:32 +03:00
|
|
|
align, NULL, NULL, NULL, NULL, NULL, data_cflags);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
while (--c != 0) {
|
|
|
|
ASSERT(zio_buf_cache[c] != NULL);
|
|
|
|
if (zio_buf_cache[c - 1] == NULL)
|
|
|
|
zio_buf_cache[c - 1] = zio_buf_cache[c];
|
|
|
|
|
|
|
|
ASSERT(zio_data_buf_cache[c] != NULL);
|
|
|
|
if (zio_data_buf_cache[c - 1] == NULL)
|
|
|
|
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_inject_init();
|
2013-01-23 13:54:30 +04:00
|
|
|
|
|
|
|
lz4_init();
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_fini(void)
|
|
|
|
{
|
2021-04-17 22:36:04 +03:00
|
|
|
size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
2021-04-17 22:36:04 +03:00
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i])
|
2016-07-22 18:52:49 +03:00
|
|
|
(void) printf("zio_fini: [%d] %llu != %llu\n",
|
2021-04-17 22:36:04 +03:00
|
|
|
(int)((i + 1) << SPA_MINBLOCKSHIFT),
|
|
|
|
(long long unsigned)zio_buf_cache_allocs[i],
|
|
|
|
(long long unsigned)zio_buf_cache_frees[i]);
|
|
|
|
}
|
2014-11-03 23:15:08 +03:00
|
|
|
#endif
|
2021-04-17 22:36:04 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The same kmem cache can show up multiple times in both zio_buf_cache
|
|
|
|
* and zio_data_buf_cache. Do a wasteful but trivially correct scan to
|
|
|
|
* sort it out.
|
|
|
|
*/
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
kmem_cache_t *cache = zio_buf_cache[i];
|
|
|
|
if (cache == NULL)
|
|
|
|
continue;
|
|
|
|
for (size_t j = i; j < n; j++) {
|
|
|
|
if (cache == zio_buf_cache[j])
|
|
|
|
zio_buf_cache[j] = NULL;
|
|
|
|
if (cache == zio_data_buf_cache[j])
|
|
|
|
zio_data_buf_cache[j] = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2021-04-17 22:36:04 +03:00
|
|
|
kmem_cache_destroy(cache);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2021-04-17 22:36:04 +03:00
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
kmem_cache_t *cache = zio_data_buf_cache[i];
|
|
|
|
if (cache == NULL)
|
|
|
|
continue;
|
|
|
|
for (size_t j = i; j < n; j++) {
|
|
|
|
if (cache == zio_data_buf_cache[j])
|
|
|
|
zio_data_buf_cache[j] = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2021-04-17 22:36:04 +03:00
|
|
|
kmem_cache_destroy(cache);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
VERIFY3P(zio_buf_cache[i], ==, NULL);
|
|
|
|
VERIFY3P(zio_data_buf_cache[i], ==, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
kmem_cache_destroy(zio_link_cache);
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_cache_destroy(zio_cache);
|
|
|
|
|
|
|
|
zio_inject_fini();
|
2013-01-23 13:54:30 +04:00
|
|
|
|
|
|
|
lz4_fini();
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Allocate and free I/O buffers
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
|
|
|
|
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
|
|
|
|
* useful to inspect ZFS metadata, but if possible, we should avoid keeping
|
|
|
|
* excess / transient data in-core during a crashdump.
|
|
|
|
*/
|
|
|
|
void *
|
|
|
|
zio_buf_alloc(size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
2016-07-22 18:52:49 +03:00
|
|
|
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
|
|
|
atomic_add_64(&zio_buf_cache_allocs[c], 1);
|
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-12-03 22:56:32 +03:00
|
|
|
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use zio_data_buf_alloc to allocate data. The data will not appear in a
|
|
|
|
* crashdump if the kernel panics. This exists so that we will limit the amount
|
|
|
|
* of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
|
|
|
|
* of kernel heap dumped to disk when the kernel panics)
|
|
|
|
*/
|
|
|
|
void *
|
|
|
|
zio_data_buf_alloc(size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-12-03 22:56:32 +03:00
|
|
|
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_buf_free(void *buf, size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
2016-07-22 18:52:49 +03:00
|
|
|
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
|
|
|
atomic_add_64(&zio_buf_cache_frees[c], 1);
|
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
kmem_cache_free(zio_buf_cache[c], buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_data_buf_free(void *buf, size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
kmem_cache_free(zio_data_buf_cache[c], buf);
|
|
|
|
}
|
|
|
|
|
2017-01-05 22:10:07 +03:00
|
|
|
static void
|
|
|
|
zio_abd_free(void *abd, size_t size)
|
|
|
|
{
|
|
|
|
abd_free((abd_t *)abd);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Push and pop I/O transform buffers
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2016-06-02 07:04:53 +03:00
|
|
|
void
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
|
2017-01-12 20:42:11 +03:00
|
|
|
zio_transform_func_t *transform)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2014-11-21 03:09:39 +03:00
|
|
|
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zt->zt_orig_abd = zio->io_abd;
|
2008-12-03 23:09:06 +03:00
|
|
|
zt->zt_orig_size = zio->io_size;
|
2008-11-20 23:01:55 +03:00
|
|
|
zt->zt_bufsize = bufsize;
|
2008-12-03 23:09:06 +03:00
|
|
|
zt->zt_transform = transform;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zt->zt_next = zio->io_transform_stack;
|
|
|
|
zio->io_transform_stack = zt;
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio->io_abd = data;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_size = size;
|
|
|
|
}
|
|
|
|
|
2016-06-02 07:04:53 +03:00
|
|
|
void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_pop_transforms(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_transform_t *zt;
|
|
|
|
|
|
|
|
while ((zt = zio->io_transform_stack) != NULL) {
|
|
|
|
if (zt->zt_transform != NULL)
|
|
|
|
zt->zt_transform(zio,
|
2016-07-22 18:52:49 +03:00
|
|
|
zt->zt_orig_abd, zt->zt_orig_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zt->zt_bufsize != 0)
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_free(zio->io_abd);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio->io_abd = zt->zt_orig_abd;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_size = zt->zt_orig_size;
|
|
|
|
zio->io_transform_stack = zt->zt_next;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
kmem_free(zt, sizeof (zio_transform_t));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
* I/O transform callbacks for subblocks, decompression, and decryption
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static void
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
ASSERT(zio->io_size > size);
|
|
|
|
|
|
|
|
if (zio->io_type == ZIO_TYPE_READ)
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_copy(data, zio->io_abd, size);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2016-07-22 18:52:49 +03:00
|
|
|
if (zio->io_error == 0) {
|
|
|
|
void *tmp = abd_borrow_buf(data, size);
|
|
|
|
int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
|
Add zstd support to zfs
This PR adds two new compression types, based on ZStandard:
- zstd: A basic ZStandard compression algorithm Available compression.
Levels for zstd are zstd-1 through zstd-19, where the compression
increases with every level, but speed decreases.
- zstd-fast: A faster version of the ZStandard compression algorithm
zstd-fast is basically a "negative" level of zstd. The compression
decreases with every level, but speed increases.
Available compression levels for zstd-fast:
- zstd-fast-1 through zstd-fast-10
- zstd-fast-20 through zstd-fast-100 (in increments of 10)
- zstd-fast-500 and zstd-fast-1000
For more information check the man page.
Implementation details:
Rather than treat each level of zstd as a different algorithm (as was
done historically with gzip), the block pointer `enum zio_compress`
value is simply zstd for all levels, including zstd-fast, since they all
use the same decompression function.
The compress= property (a 64bit unsigned integer) uses the lower 7 bits
to store the compression algorithm (matching the number of bits used in
a block pointer, as the 8th bit was borrowed for embedded block
pointers). The upper bits are used to store the compression level.
It is necessary to be able to determine what compression level was used
when later reading a block back, so the concept used in LZ4, where the
first 32bits of the on-disk value are the size of the compressed data
(since the allocation is rounded up to the nearest ashift), was
extended, and we store the version of ZSTD and the level as well as the
compressed size. This value is returned when decompressing a block, so
that if the block needs to be recompressed (L2ARC, nop-write, etc), that
the same parameters will be used to result in the matching checksum.
All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`,
`zio_prop_t`, etc.) uses the separated _compress and _complevel
variables. Only the properties ZAP contains the combined/bit-shifted
value. The combined value is split when the compression_changed_cb()
callback is called, and sets both objset members (os_compress and
os_complevel).
The userspace tools all use the combined/bit-shifted value.
Additional notes:
zdb can now also decode the ZSTD compression header (flag -Z) and
inspect the size, version and compression level saved in that header.
For each record, if it is ZSTD compressed, the parameters of the decoded
compression header get printed.
ZSTD is included with all current tests and new tests are added
as-needed.
Per-dataset feature flags now get activated when the property is set.
If a compression algorithm requires a feature flag, zfs activates the
feature when the property is set, rather than waiting for the first
block to be born. This is currently only used by zstd but can be
extended as needed.
Portions-Sponsored-By: The FreeBSD Foundation
Co-authored-by: Allan Jude <allanjude@freebsd.org>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Co-authored-by: Michael Niewöhner <foss@mniewoehner.de>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #6247
Closes #9024
Closes #10277
Closes #10278
2020-08-18 20:10:17 +03:00
|
|
|
zio->io_abd, tmp, zio->io_size, size,
|
|
|
|
&zio->io_prop.zp_complevel);
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_return_buf_copy(data, tmp, size);
|
|
|
|
|
2018-08-29 21:33:33 +03:00
|
|
|
if (zio_injection_enabled && ret == 0)
|
|
|
|
ret = zio_handle_fault_injection(zio, EINVAL);
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
if (ret != 0)
|
|
|
|
zio->io_error = SET_ERROR(EIO);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
static void
|
|
|
|
zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
void *tmp;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
2017-11-08 22:12:59 +03:00
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
uint64_t dsobj = zio->io_bookmark.zb_objset;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
uint64_t lsize = BP_GET_LSIZE(bp);
|
|
|
|
dmu_object_type_t ot = BP_GET_TYPE(bp);
|
|
|
|
uint8_t salt[ZIO_DATA_SALT_LEN];
|
|
|
|
uint8_t iv[ZIO_DATA_IV_LEN];
|
|
|
|
uint8_t mac[ZIO_DATA_MAC_LEN];
|
|
|
|
boolean_t no_crypt = B_FALSE;
|
|
|
|
|
|
|
|
ASSERT(BP_USES_CRYPT(bp));
|
|
|
|
ASSERT3U(size, !=, 0);
|
|
|
|
|
|
|
|
if (zio->io_error != 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify the cksum of MACs stored in an indirect bp. It will always
|
|
|
|
* be possible to verify this since it does not require an encryption
|
|
|
|
* key.
|
|
|
|
*/
|
|
|
|
if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
|
|
|
|
zio_crypt_decode_mac_bp(bp, mac);
|
|
|
|
|
|
|
|
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
|
|
|
|
/*
|
|
|
|
* We haven't decompressed the data yet, but
|
|
|
|
* zio_crypt_do_indirect_mac_checksum() requires
|
|
|
|
* decompressed data to be able to parse out the MACs
|
|
|
|
* from the indirect block. We decompress it now and
|
|
|
|
* throw away the result after we are finished.
|
|
|
|
*/
|
|
|
|
tmp = zio_buf_alloc(lsize);
|
|
|
|
ret = zio_decompress_data(BP_GET_COMPRESS(bp),
|
Add zstd support to zfs
This PR adds two new compression types, based on ZStandard:
- zstd: A basic ZStandard compression algorithm Available compression.
Levels for zstd are zstd-1 through zstd-19, where the compression
increases with every level, but speed decreases.
- zstd-fast: A faster version of the ZStandard compression algorithm
zstd-fast is basically a "negative" level of zstd. The compression
decreases with every level, but speed increases.
Available compression levels for zstd-fast:
- zstd-fast-1 through zstd-fast-10
- zstd-fast-20 through zstd-fast-100 (in increments of 10)
- zstd-fast-500 and zstd-fast-1000
For more information check the man page.
Implementation details:
Rather than treat each level of zstd as a different algorithm (as was
done historically with gzip), the block pointer `enum zio_compress`
value is simply zstd for all levels, including zstd-fast, since they all
use the same decompression function.
The compress= property (a 64bit unsigned integer) uses the lower 7 bits
to store the compression algorithm (matching the number of bits used in
a block pointer, as the 8th bit was borrowed for embedded block
pointers). The upper bits are used to store the compression level.
It is necessary to be able to determine what compression level was used
when later reading a block back, so the concept used in LZ4, where the
first 32bits of the on-disk value are the size of the compressed data
(since the allocation is rounded up to the nearest ashift), was
extended, and we store the version of ZSTD and the level as well as the
compressed size. This value is returned when decompressing a block, so
that if the block needs to be recompressed (L2ARC, nop-write, etc), that
the same parameters will be used to result in the matching checksum.
All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`,
`zio_prop_t`, etc.) uses the separated _compress and _complevel
variables. Only the properties ZAP contains the combined/bit-shifted
value. The combined value is split when the compression_changed_cb()
callback is called, and sets both objset members (os_compress and
os_complevel).
The userspace tools all use the combined/bit-shifted value.
Additional notes:
zdb can now also decode the ZSTD compression header (flag -Z) and
inspect the size, version and compression level saved in that header.
For each record, if it is ZSTD compressed, the parameters of the decoded
compression header get printed.
ZSTD is included with all current tests and new tests are added
as-needed.
Per-dataset feature flags now get activated when the property is set.
If a compression algorithm requires a feature flag, zfs activates the
feature when the property is set, rather than waiting for the first
block to be born. This is currently only used by zstd but can be
extended as needed.
Portions-Sponsored-By: The FreeBSD Foundation
Co-authored-by: Allan Jude <allanjude@freebsd.org>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Co-authored-by: Michael Niewöhner <foss@mniewoehner.de>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #6247
Closes #9024
Closes #10277
Closes #10278
2020-08-18 20:10:17 +03:00
|
|
|
zio->io_abd, tmp, zio->io_size, lsize,
|
|
|
|
&zio->io_prop.zp_complevel);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (ret != 0) {
|
|
|
|
ret = SET_ERROR(EIO);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
|
|
|
|
tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
|
|
|
|
zio_buf_free(tmp, lsize);
|
|
|
|
} else {
|
|
|
|
ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
|
|
|
|
zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
|
|
|
|
}
|
|
|
|
abd_copy(data, zio->io_abd, size);
|
|
|
|
|
2018-05-03 01:36:20 +03:00
|
|
|
if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
|
|
|
|
ret = zio_handle_decrypt_injection(spa,
|
|
|
|
&zio->io_bookmark, ot, ECKSUM);
|
|
|
|
}
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (ret != 0)
|
|
|
|
goto error;
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is an authenticated block, just check the MAC. It would be
|
|
|
|
* nice to separate this out into its own flag, but for the moment
|
|
|
|
* enum zio_flag is out of bits.
|
|
|
|
*/
|
|
|
|
if (BP_IS_AUTHENTICATED(bp)) {
|
|
|
|
if (ot == DMU_OT_OBJSET) {
|
2017-11-08 22:12:59 +03:00
|
|
|
ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
|
|
|
|
dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
} else {
|
|
|
|
zio_crypt_decode_mac_bp(bp, mac);
|
2017-11-08 22:12:59 +03:00
|
|
|
ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
|
|
|
|
zio->io_abd, size, mac);
|
2018-05-03 01:36:20 +03:00
|
|
|
if (zio_injection_enabled && ret == 0) {
|
|
|
|
ret = zio_handle_decrypt_injection(spa,
|
|
|
|
&zio->io_bookmark, ot, ECKSUM);
|
|
|
|
}
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
abd_copy(data, zio->io_abd, size);
|
|
|
|
|
|
|
|
if (ret != 0)
|
|
|
|
goto error;
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_crypt_decode_params_bp(bp, salt, iv);
|
|
|
|
|
|
|
|
if (ot == DMU_OT_INTENT_LOG) {
|
|
|
|
tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
|
|
|
|
zio_crypt_decode_mac_zil(tmp, mac);
|
|
|
|
abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
|
|
|
|
} else {
|
|
|
|
zio_crypt_decode_mac_bp(bp, mac);
|
|
|
|
}
|
|
|
|
|
2018-05-03 01:36:20 +03:00
|
|
|
ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
|
|
|
|
BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
|
|
|
|
zio->io_abd, &no_crypt);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (no_crypt)
|
|
|
|
abd_copy(data, zio->io_abd, size);
|
|
|
|
|
|
|
|
if (ret != 0)
|
|
|
|
goto error;
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
error:
|
|
|
|
/* assert that the key was found unless this was speculative */
|
2018-05-03 01:36:20 +03:00
|
|
|
ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If there was a decryption / authentication error return EIO as
|
|
|
|
* the io_error. If this was not a speculative zio, create an ereport.
|
|
|
|
*/
|
|
|
|
if (ret == ECKSUM) {
|
2018-03-31 21:12:51 +03:00
|
|
|
zio->io_error = SET_ERROR(EIO);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
|
2018-05-03 01:36:20 +03:00
|
|
|
spa_log_error(spa, &zio->io_bookmark);
|
2020-09-01 05:35:11 +03:00
|
|
|
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
|
2020-09-04 20:34:28 +03:00
|
|
|
spa, NULL, &zio->io_bookmark, zio, 0);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
zio->io_error = ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O parent/child relationships and pipeline interlocks
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_walk_parents(zio_t *cio, zio_link_t **zl)
|
2009-02-18 23:51:31 +03:00
|
|
|
{
|
|
|
|
list_t *pl = &cio->io_parent_list;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
|
|
|
|
if (*zl == NULL)
|
2009-02-18 23:51:31 +03:00
|
|
|
return (NULL);
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT((*zl)->zl_child == cio);
|
|
|
|
return ((*zl)->zl_parent);
|
2009-02-18 23:51:31 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_walk_children(zio_t *pio, zio_link_t **zl)
|
2009-02-18 23:51:31 +03:00
|
|
|
{
|
|
|
|
list_t *cl = &pio->io_child_list;
|
|
|
|
|
2017-12-21 20:13:06 +03:00
|
|
|
ASSERT(MUTEX_HELD(&pio->io_lock));
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
|
|
|
|
if (*zl == NULL)
|
2009-02-18 23:51:31 +03:00
|
|
|
return (NULL);
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT((*zl)->zl_parent == pio);
|
|
|
|
return ((*zl)->zl_child);
|
2009-02-18 23:51:31 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_unique_parent(zio_t *cio)
|
|
|
|
{
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_link_t *zl = NULL;
|
|
|
|
zio_t *pio = zio_walk_parents(cio, &zl);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
|
2009-02-18 23:51:31 +03:00
|
|
|
return (pio);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_add_child(zio_t *pio, zio_t *cio)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2014-11-21 03:09:39 +03:00
|
|
|
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Logical I/Os can have logical, gang, or vdev children.
|
|
|
|
* Gang I/Os can have gang or vdev children.
|
|
|
|
* Vdev I/Os can only have vdev children.
|
|
|
|
* The following ASSERT captures all of these constraints.
|
|
|
|
*/
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
zl->zl_parent = pio;
|
|
|
|
zl->zl_child = cio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_enter(&cio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
|
2009-02-18 23:51:31 +03:00
|
|
|
pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
|
|
|
|
|
|
|
|
list_insert_head(&pio->io_child_list, zl);
|
|
|
|
list_insert_head(&cio->io_parent_list, zl);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
pio->io_child_count++;
|
|
|
|
cio->io_parent_count++;
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_exit(&cio->io_lock);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
ASSERT(zl->zl_parent == pio);
|
|
|
|
ASSERT(zl->zl_child == cio);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_enter(&cio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
list_remove(&pio->io_child_list, zl);
|
|
|
|
list_remove(&cio->io_parent_list, zl);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
pio->io_child_count--;
|
|
|
|
cio->io_parent_count--;
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_exit(&cio->io_lock);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
kmem_cache_free(zio_link_cache, zl);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static boolean_t
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
boolean_t waiting = B_FALSE;
|
|
|
|
|
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
ASSERT(zio->io_stall == NULL);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
|
|
|
|
if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
uint64_t *countp = &zio->io_children[c][wait];
|
|
|
|
if (*countp != 0) {
|
|
|
|
zio->io_stage >>= 1;
|
|
|
|
ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
|
|
|
|
zio->io_stall = countp;
|
|
|
|
waiting = B_TRUE;
|
|
|
|
break;
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
return (waiting);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 21:58:00 +04:00
|
|
|
__attribute__((always_inline))
|
|
|
|
static inline void
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
|
|
|
|
zio_t **next_to_executep)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
|
|
|
|
int *errorp = &pio->io_child_error[zio->io_child_type];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
|
|
|
|
*errorp = zio_worst_error(*errorp, zio->io_error);
|
|
|
|
pio->io_reexecute |= zio->io_reexecute;
|
|
|
|
ASSERT3U(*countp, >, 0);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
|
|
|
|
(*countp)--;
|
|
|
|
|
|
|
|
if (*countp == 0 && pio->io_stall == countp) {
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_taskq_type_t type =
|
|
|
|
pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
|
|
|
|
ZIO_TASKQ_INTERRUPT;
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_stall = NULL;
|
|
|
|
mutex_exit(&pio->io_lock);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
* If we can tell the caller to execute this parent next, do
|
|
|
|
* so. Otherwise dispatch the parent zio as its own task.
|
|
|
|
*
|
|
|
|
* Having the caller execute the parent when possible reduces
|
|
|
|
* locking on the zio taskq's, reduces context switch
|
|
|
|
* overhead, and has no recursion penalty. Note that one
|
|
|
|
* read from disk typically causes at least 3 zio's: a
|
|
|
|
* zio_null(), the logical zio_read(), and then a physical
|
|
|
|
* zio. When the physical ZIO completes, we are able to call
|
|
|
|
* zio_done() on all 3 of these zio's from one invocation of
|
|
|
|
* zio_execute() by returning the parent back to
|
|
|
|
* zio_execute(). Since the parent isn't executed until this
|
|
|
|
* thread returns back to zio_execute(), the caller should do
|
|
|
|
* so promptly.
|
|
|
|
*
|
|
|
|
* In other cases, dispatching the parent prevents
|
|
|
|
* overflowing the stack when we have deeply nested
|
|
|
|
* parent-child relationships, as we do with the "mega zio"
|
|
|
|
* of writes for spa_sync(), and the chain of ZIL blocks.
|
2016-10-14 03:59:18 +03:00
|
|
|
*/
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
if (next_to_executep != NULL && *next_to_executep == NULL) {
|
|
|
|
*next_to_executep = pio;
|
|
|
|
} else {
|
|
|
|
zio_taskq_dispatch(pio, type, B_FALSE);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
mutex_exit(&pio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
|
|
|
|
{
|
|
|
|
if (zio->io_child_error[c] != 0 && zio->io_error == 0)
|
|
|
|
zio->io_error = zio->io_child_error[c];
|
|
|
|
}
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
int
|
2017-03-21 04:36:00 +03:00
|
|
|
zio_bookmark_compare(const void *x1, const void *x2)
|
2016-10-14 03:59:18 +03:00
|
|
|
{
|
|
|
|
const zio_t *z1 = x1;
|
|
|
|
const zio_t *z2 = x2;
|
|
|
|
|
2017-03-21 04:36:00 +03:00
|
|
|
if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
|
|
|
|
return (-1);
|
|
|
|
if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
|
|
|
|
return (1);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
2017-03-21 04:36:00 +03:00
|
|
|
if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
|
|
|
|
return (-1);
|
|
|
|
if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
|
|
|
|
return (1);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
2017-03-21 04:36:00 +03:00
|
|
|
if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
|
|
|
|
return (-1);
|
|
|
|
if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
|
|
|
|
return (1);
|
|
|
|
|
|
|
|
if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
|
|
|
|
return (-1);
|
|
|
|
if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
|
|
|
|
return (1);
|
|
|
|
|
|
|
|
if (z1 < z2)
|
|
|
|
return (-1);
|
|
|
|
if (z1 > z2)
|
|
|
|
return (1);
|
|
|
|
|
|
|
|
return (0);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* Create the various types of I/O (read, write, free, etc)
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
|
2016-07-11 20:45:52 +03:00
|
|
|
void *private, zio_type_t type, zio_priority_t priority,
|
|
|
|
enum zio_flag flags, vdev_t *vd, uint64_t offset,
|
|
|
|
const zbookmark_phys_t *zb, enum zio_stage stage,
|
|
|
|
enum zio_stage pipeline)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
|
2016-07-11 20:45:52 +03:00
|
|
|
ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
|
|
|
|
|
|
|
|
ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
|
|
|
|
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
|
|
|
|
ASSERT(vd || stage == ZIO_STAGE_OPEN);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
|
2016-07-11 20:45:52 +03:00
|
|
|
|
2014-11-21 03:09:39 +03:00
|
|
|
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
|
2015-01-30 22:25:19 +03:00
|
|
|
bzero(zio, sizeof (zio_t));
|
|
|
|
|
Identify locks flagged by lockdep
When running a kernel with CONFIG_LOCKDEP=y, lockdep reports possible
recursive locking in some cases and possible circular locking dependency
in others, within the SPL and ZFS modules.
This patch uses a mutex type defined in SPL, MUTEX_NOLOCKDEP, to mark
such mutexes when they are initialized. This mutex type causes
attempts to take or release those locks to be wrapped in lockdep_off()
and lockdep_on() calls to silence the dependency checker and allow the
use of lock_stats to examine contention.
For RW locks, it uses an analogous lock type, RW_NOLOCKDEP.
The goal is that these locks are ultimately changed back to type
MUTEX_DEFAULT or RW_DEFAULT, after the locks are annotated to reflect
their relationship (e.g. z_name_lock below) or any real problem with the
lock dependencies are fixed.
Some of the affected locks are:
tc_open_lock:
=============
This is an array of locks, all with same name, which txg_quiesce must
take all of in order to move txg to next state. All default to the same
lockdep class, and so to lockdep appears recursive.
zp->z_name_lock:
================
In zfs_rmdir,
dzp = znode for the directory (input to zfs_dirent_lock)
zp = znode for the entry being removed (output of zfs_dirent_lock)
zfs_rmdir()->zfs_dirent_lock() takes z_name_lock in dzp
zfs_rmdir() takes z_name_lock in zp
Since both dzp and zp are type znode_t, the locks have the same default
class, and lockdep considers it a possible recursive lock attempt.
l->l_rwlock:
============
zap_expand_leaf() sometimes creates two new zap leaf structures, via
these call paths:
zap_deref_leaf()->zap_get_leaf_byblk()->zap_leaf_open()
zap_expand_leaf()->zap_create_leaf()->zap_expand_leaf()->zap_create_leaf()
Because both zap_leaf_open() and zap_create_leaf() initialize
l->l_rwlock in their (separate) leaf structures, the lockdep class is
the same, and the linux kernel believes these might both be the same
lock, and emits a possible recursive lock warning.
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3895
2015-10-15 23:08:27 +03:00
|
|
|
mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
|
2015-01-30 22:25:19 +03:00
|
|
|
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
|
|
|
|
|
|
|
|
list_create(&zio->io_parent_list, sizeof (zio_link_t),
|
|
|
|
offsetof(zio_link_t, zl_parent_node));
|
|
|
|
list_create(&zio->io_child_list, sizeof (zio_link_t),
|
|
|
|
offsetof(zio_link_t, zl_child_node));
|
2017-01-12 22:52:56 +03:00
|
|
|
metaslab_trace_init(&zio->io_alloc_list);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd != NULL)
|
|
|
|
zio->io_child_type = ZIO_CHILD_VDEV;
|
|
|
|
else if (flags & ZIO_FLAG_GANG_CHILD)
|
|
|
|
zio->io_child_type = ZIO_CHILD_GANG;
|
2010-05-29 00:45:14 +04:00
|
|
|
else if (flags & ZIO_FLAG_DDT_CHILD)
|
|
|
|
zio->io_child_type = ZIO_CHILD_DDT;
|
2008-12-03 23:09:06 +03:00
|
|
|
else
|
|
|
|
zio->io_child_type = ZIO_CHILD_LOGICAL;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (bp != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_bp = (blkptr_t *)bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_bp_copy = *bp;
|
|
|
|
zio->io_bp_orig = *bp;
|
2010-05-29 00:45:14 +04:00
|
|
|
if (type != ZIO_TYPE_WRITE ||
|
|
|
|
zio->io_child_type == ZIO_CHILD_DDT)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_logical = zio;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
|
|
|
|
pipeline |= ZIO_GANG_STAGES;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
zio->io_spa = spa;
|
|
|
|
zio->io_txg = txg;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_done = done;
|
|
|
|
zio->io_private = private;
|
|
|
|
zio->io_type = type;
|
|
|
|
zio->io_priority = priority;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_vd = vd;
|
|
|
|
zio->io_offset = offset;
|
2016-07-22 18:52:49 +03:00
|
|
|
zio->io_orig_abd = zio->io_abd = data;
|
2016-07-11 20:45:52 +03:00
|
|
|
zio->io_orig_size = zio->io_size = psize;
|
|
|
|
zio->io_lsize = lsize;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_orig_flags = zio->io_flags = flags;
|
|
|
|
zio->io_orig_stage = zio->io_stage = stage;
|
|
|
|
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
|
2016-10-14 03:59:18 +03:00
|
|
|
zio->io_pipeline_trace = ZIO_STAGE_OPEN;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
|
|
|
|
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zb != NULL)
|
|
|
|
zio->io_bookmark = *zb;
|
|
|
|
|
|
|
|
if (pio != NULL) {
|
2018-09-06 04:33:36 +03:00
|
|
|
if (zio->io_metaslab_class == NULL)
|
|
|
|
zio->io_metaslab_class = pio->io_metaslab_class;
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_logical == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_logical = pio->io_logical;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_child_type == ZIO_CHILD_GANG)
|
|
|
|
zio->io_gang_leader = pio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_add_child(pio, zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-11-08 04:26:52 +04:00
|
|
|
taskq_init_ent(&zio->io_tqent);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_destroy(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2017-01-12 22:52:56 +03:00
|
|
|
metaslab_trace_fini(&zio->io_alloc_list);
|
2015-01-30 22:25:19 +03:00
|
|
|
list_destroy(&zio->io_parent_list);
|
|
|
|
list_destroy(&zio->io_child_list);
|
|
|
|
mutex_destroy(&zio->io_lock);
|
|
|
|
cv_destroy(&zio->io_cv);
|
2008-12-03 23:09:06 +03:00
|
|
|
kmem_cache_free(zio_cache, zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
|
2010-05-29 00:45:14 +04:00
|
|
|
void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
|
2009-02-18 23:51:31 +03:00
|
|
|
ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
return (zio_null(NULL, spa, NULL, done, private, flags));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2020-02-11 01:00:05 +03:00
|
|
|
static int
|
|
|
|
zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
|
|
|
|
enum blk_verify_flag blk_verify, const char *fmt, ...)
|
|
|
|
{
|
|
|
|
va_list adx;
|
|
|
|
char buf[256];
|
|
|
|
|
|
|
|
va_start(adx, fmt);
|
|
|
|
(void) vsnprintf(buf, sizeof (buf), fmt, adx);
|
|
|
|
va_end(adx);
|
|
|
|
|
|
|
|
switch (blk_verify) {
|
|
|
|
case BLK_VERIFY_HALT:
|
2020-03-05 02:08:41 +03:00
|
|
|
dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
|
2020-02-11 01:00:05 +03:00
|
|
|
zfs_panic_recover("%s: %s", spa_name(spa), buf);
|
|
|
|
break;
|
|
|
|
case BLK_VERIFY_LOG:
|
|
|
|
zfs_dbgmsg("%s: %s", spa_name(spa), buf);
|
|
|
|
break;
|
|
|
|
case BLK_VERIFY_ONLY:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify the block pointer fields contain reasonable values. This means
|
|
|
|
* it only contains known object types, checksum/compression identifiers,
|
|
|
|
* block sizes within the maximum allowed limits, valid DVAs, etc.
|
|
|
|
*
|
|
|
|
* If everything checks out B_TRUE is returned. The zfs_blkptr_verify
|
|
|
|
* argument controls the behavior when an invalid field is detected.
|
|
|
|
*
|
|
|
|
* Modes for zfs_blkptr_verify:
|
|
|
|
* 1) BLK_VERIFY_ONLY (evaluate the block)
|
|
|
|
* 2) BLK_VERIFY_LOG (evaluate the block and log problems)
|
|
|
|
* 3) BLK_VERIFY_HALT (call zfs_panic_recover on error)
|
|
|
|
*/
|
|
|
|
boolean_t
|
|
|
|
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
|
|
|
|
enum blk_verify_flag blk_verify)
|
2014-11-26 20:57:30 +03:00
|
|
|
{
|
2020-02-11 01:00:05 +03:00
|
|
|
int errors = 0;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p has invalid TYPE %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, (longlong_t)BP_GET_TYPE(bp));
|
|
|
|
}
|
|
|
|
if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
|
|
|
|
BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p has invalid CHECKSUM %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, (longlong_t)BP_GET_CHECKSUM(bp));
|
|
|
|
}
|
|
|
|
if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
|
|
|
|
BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p has invalid COMPRESS %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, (longlong_t)BP_GET_COMPRESS(bp));
|
|
|
|
}
|
|
|
|
if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p has invalid LSIZE %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, (longlong_t)BP_GET_LSIZE(bp));
|
|
|
|
}
|
|
|
|
if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p has invalid PSIZE %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, (longlong_t)BP_GET_PSIZE(bp));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (BP_IS_EMBEDDED(bp)) {
|
2019-06-25 04:02:17 +03:00
|
|
|
if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p has invalid ETYPE %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, (longlong_t)BPE_GET_ETYPE(bp));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debugability of the ZFS pool
load (and import) process. This includes:
7638 Refactor spa_load_impl into several functions
8961 SPA load/import should tell us why it failed
7277 zdb should be able to print zfs_dbgmsg's
To iterate on top of that, there's a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.
The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.
The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.
When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.
This new two-step pool load process now allows rewinding pools accross
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.
With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
of data, this feature is for now restricted to a read-only import.
Porting notes (ZTS):
* Fix 'make dist' target in zpool_import
* The maximum path length allowed by tar is 99 characters. Several
of the new test cases exceeded this limit resulting in them not
being included in the tarball. Shorten the names slightly.
* Set/get tunables using accessor functions.
* Get last synced txg via the "zfs_txg_history" mechanism.
* Clear zinject handlers in cleanup for import_cache_device_replaced
and import_rewind_device_replaced in order that the zpool can be
exported if there is an error.
* Increase FILESIZE to 8G in zfs-test.sh to allow for a larger
ext4 file system to be created on ZFS_DISK2. Also, there's
no need to partition ZFS_DISK2 at all. The partitioning had
already been disabled for multipath devices. Among other things,
the partitioning steals some space from the ext4 file system,
makes it difficult to accurately calculate the paramters to
parted and can make some of the tests fail.
* Increase FS_SIZE and FILE_SIZE in the zpool_import test
configuration now that FILESIZE is larger.
* Write more data in order that device evacuation take lonnger in
a couple tests.
* Use mkdir -p to avoid errors when the directory already exists.
* Remove use of sudo in import_rewind_config_changed.
Authored by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9075
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123
Closes #7459
2016-07-22 17:39:36 +03:00
|
|
|
/*
|
|
|
|
* Do not verify individual DVAs if the config is not trusted. This
|
|
|
|
* will be done once the zio is executed in vdev_mirror_map_alloc.
|
|
|
|
*/
|
|
|
|
if (!spa->spa_trust_config)
|
2020-02-11 01:00:05 +03:00
|
|
|
return (B_TRUE);
|
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debugability of the ZFS pool
load (and import) process. This includes:
7638 Refactor spa_load_impl into several functions
8961 SPA load/import should tell us why it failed
7277 zdb should be able to print zfs_dbgmsg's
To iterate on top of that, there's a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.
The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.
The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.
When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.
This new two-step pool load process now allows rewinding pools accross
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.
With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
of data, this feature is for now restricted to a read-only import.
Porting notes (ZTS):
* Fix 'make dist' target in zpool_import
* The maximum path length allowed by tar is 99 characters. Several
of the new test cases exceeded this limit resulting in them not
being included in the tarball. Shorten the names slightly.
* Set/get tunables using accessor functions.
* Get last synced txg via the "zfs_txg_history" mechanism.
* Clear zinject handlers in cleanup for import_cache_device_replaced
and import_rewind_device_replaced in order that the zpool can be
exported if there is an error.
* Increase FILESIZE to 8G in zfs-test.sh to allow for a larger
ext4 file system to be created on ZFS_DISK2. Also, there's
no need to partition ZFS_DISK2 at all. The partitioning had
already been disabled for multipath devices. Among other things,
the partitioning steals some space from the ext4 file system,
makes it difficult to accurately calculate the paramters to
parted and can make some of the tests fail.
* Increase FS_SIZE and FILE_SIZE in the zpool_import test
configuration now that FILESIZE is larger.
* Write more data in order that device evacuation take lonnger in
a couple tests.
* Use mkdir -p to avoid errors when the directory already exists.
* Remove use of sudo in import_rewind_config_changed.
Authored by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9075
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123
Closes #7459
2016-07-22 17:39:36 +03:00
|
|
|
|
2019-08-14 06:24:43 +03:00
|
|
|
if (!config_held)
|
|
|
|
spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
|
|
|
|
else
|
|
|
|
ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
|
2014-11-26 20:57:30 +03:00
|
|
|
/*
|
|
|
|
* Pool-specific checks.
|
|
|
|
*
|
|
|
|
* Note: it would be nice to verify that the blk_birth and
|
|
|
|
* BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
|
|
|
|
* allows the birth time of log blocks (and dmu_sync()-ed blocks
|
|
|
|
* that are in the log) to be arbitrarily large.
|
|
|
|
*/
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
|
2021-03-26 21:19:35 +03:00
|
|
|
const dva_t *dva = &bp->blk_dva[i];
|
|
|
|
uint64_t vdevid = DVA_GET_VDEV(dva);
|
2017-11-04 23:25:13 +03:00
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
if (vdevid >= spa->spa_root_vdev->vdev_children) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p DVA %u has invalid VDEV %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, i, (longlong_t)vdevid);
|
2016-01-09 20:29:05 +03:00
|
|
|
continue;
|
2014-11-26 20:57:30 +03:00
|
|
|
}
|
2017-11-04 23:25:13 +03:00
|
|
|
vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
|
2014-11-26 20:57:30 +03:00
|
|
|
if (vd == NULL) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p DVA %u has invalid VDEV %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, i, (longlong_t)vdevid);
|
2016-01-09 20:29:05 +03:00
|
|
|
continue;
|
2014-11-26 20:57:30 +03:00
|
|
|
}
|
|
|
|
if (vd->vdev_ops == &vdev_hole_ops) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p DVA %u has hole VDEV %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, i, (longlong_t)vdevid);
|
2016-01-09 20:29:05 +03:00
|
|
|
continue;
|
2014-11-26 20:57:30 +03:00
|
|
|
}
|
|
|
|
if (vd->vdev_ops == &vdev_missing_ops) {
|
|
|
|
/*
|
|
|
|
* "missing" vdevs are valid during import, but we
|
|
|
|
* don't have their detailed info (e.g. asize), so
|
|
|
|
* we can't perform any more checks on them.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
}
|
2021-03-26 21:19:35 +03:00
|
|
|
uint64_t offset = DVA_GET_OFFSET(dva);
|
|
|
|
uint64_t asize = DVA_GET_ASIZE(dva);
|
|
|
|
if (DVA_GET_GANG(dva))
|
|
|
|
asize = vdev_gang_header_asize(vd);
|
2014-11-26 20:57:30 +03:00
|
|
|
if (offset + asize > vd->vdev_asize) {
|
2020-02-11 01:00:05 +03:00
|
|
|
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
|
|
|
|
"blkptr at %p DVA %u has invalid OFFSET %llu",
|
2014-11-26 20:57:30 +03:00
|
|
|
bp, i, (longlong_t)offset);
|
|
|
|
}
|
|
|
|
}
|
2020-03-05 02:08:41 +03:00
|
|
|
if (errors > 0)
|
|
|
|
dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
|
2019-08-14 06:24:43 +03:00
|
|
|
if (!config_held)
|
|
|
|
spa_config_exit(spa, SCL_VDEV, bp);
|
2020-02-11 01:00:05 +03:00
|
|
|
|
|
|
|
return (errors == 0);
|
2014-11-26 20:57:30 +03:00
|
|
|
}
|
|
|
|
|
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debugability of the ZFS pool
load (and import) process. This includes:
7638 Refactor spa_load_impl into several functions
8961 SPA load/import should tell us why it failed
7277 zdb should be able to print zfs_dbgmsg's
To iterate on top of that, there's a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.
The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.
The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.
When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.
This new two-step pool load process now allows rewinding pools accross
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.
With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
of data, this feature is for now restricted to a read-only import.
Porting notes (ZTS):
* Fix 'make dist' target in zpool_import
* The maximum path length allowed by tar is 99 characters. Several
of the new test cases exceeded this limit resulting in them not
being included in the tarball. Shorten the names slightly.
* Set/get tunables using accessor functions.
* Get last synced txg via the "zfs_txg_history" mechanism.
* Clear zinject handlers in cleanup for import_cache_device_replaced
and import_rewind_device_replaced in order that the zpool can be
exported if there is an error.
* Increase FILESIZE to 8G in zfs-test.sh to allow for a larger
ext4 file system to be created on ZFS_DISK2. Also, there's
no need to partition ZFS_DISK2 at all. The partitioning had
already been disabled for multipath devices. Among other things,
the partitioning steals some space from the ext4 file system,
makes it difficult to accurately calculate the paramters to
parted and can make some of the tests fail.
* Increase FS_SIZE and FILE_SIZE in the zpool_import test
configuration now that FILESIZE is larger.
* Write more data in order that device evacuation take lonnger in
a couple tests.
* Use mkdir -p to avoid errors when the directory already exists.
* Remove use of sudo in import_rewind_config_changed.
Authored by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9075
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123
Closes #7459
2016-07-22 17:39:36 +03:00
|
|
|
boolean_t
|
|
|
|
zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
|
|
|
|
{
|
|
|
|
uint64_t vdevid = DVA_GET_VDEV(dva);
|
|
|
|
|
|
|
|
if (vdevid >= spa->spa_root_vdev->vdev_children)
|
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
|
|
|
|
if (vd == NULL)
|
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
if (vd->vdev_ops == &vdev_hole_ops)
|
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
if (vd->vdev_ops == &vdev_missing_ops) {
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t offset = DVA_GET_OFFSET(dva);
|
|
|
|
uint64_t asize = DVA_GET_ASIZE(dva);
|
|
|
|
|
2021-03-26 21:19:35 +03:00
|
|
|
if (DVA_GET_GANG(dva))
|
|
|
|
asize = vdev_gang_header_asize(vd);
|
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debugability of the ZFS pool
load (and import) process. This includes:
7638 Refactor spa_load_impl into several functions
8961 SPA load/import should tell us why it failed
7277 zdb should be able to print zfs_dbgmsg's
To iterate on top of that, there's a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.
The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.
The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.
When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.
This new two-step pool load process now allows rewinding pools accross
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.
With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
of data, this feature is for now restricted to a read-only import.
Porting notes (ZTS):
* Fix 'make dist' target in zpool_import
* The maximum path length allowed by tar is 99 characters. Several
of the new test cases exceeded this limit resulting in them not
being included in the tarball. Shorten the names slightly.
* Set/get tunables using accessor functions.
* Get last synced txg via the "zfs_txg_history" mechanism.
* Clear zinject handlers in cleanup for import_cache_device_replaced
and import_rewind_device_replaced in order that the zpool can be
exported if there is an error.
* Increase FILESIZE to 8G in zfs-test.sh to allow for a larger
ext4 file system to be created on ZFS_DISK2. Also, there's
no need to partition ZFS_DISK2 at all. The partitioning had
already been disabled for multipath devices. Among other things,
the partitioning steals some space from the ext4 file system,
makes it difficult to accurately calculate the paramters to
parted and can make some of the tests fail.
* Increase FS_SIZE and FILE_SIZE in the zpool_import test
configuration now that FILESIZE is larger.
* Write more data in order that device evacuation take lonnger in
a couple tests.
* Use mkdir -p to avoid errors when the directory already exists.
* Remove use of sudo in import_rewind_config_changed.
Authored by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9075
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123
Closes #7459
2016-07-22 17:39:36 +03:00
|
|
|
if (offset + asize > vd->vdev_asize)
|
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
|
2014-06-25 22:37:59 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2020-02-11 01:00:05 +03:00
|
|
|
(void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
|
|
|
|
BLK_VERIFY_HALT);
|
2014-11-26 20:57:30 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
|
2016-07-11 20:45:52 +03:00
|
|
|
data, size, size, done, private,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
|
|
|
|
ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
|
2016-05-15 18:02:28 +03:00
|
|
|
zio_done_func_t *ready, zio_done_func_t *children_ready,
|
|
|
|
zio_done_func_t *physdone, zio_done_func_t *done,
|
|
|
|
void *private, zio_priority_t priority, enum zio_flag flags,
|
|
|
|
const zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
|
|
|
|
zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
|
|
|
|
zp->zp_compress >= ZIO_COMPRESS_OFF &&
|
|
|
|
zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
|
2012-12-14 03:24:15 +04:00
|
|
|
DMU_OT_IS_VALID(zp->zp_type) &&
|
2008-12-03 23:09:06 +03:00
|
|
|
zp->zp_level < 32 &&
|
2010-05-29 00:45:14 +04:00
|
|
|
zp->zp_copies > 0 &&
|
2013-05-10 23:47:54 +04:00
|
|
|
zp->zp_copies <= spa_max_replication(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
|
|
|
|
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zio->io_ready = ready;
|
2016-05-15 18:02:28 +03:00
|
|
|
zio->io_children_ready = children_ready;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio->io_physdone = physdone;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_prop = *zp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
/*
|
|
|
|
* Data can be NULL if we are going to call zio_write_override() to
|
|
|
|
* provide the already-allocated BP. But we may need the data to
|
|
|
|
* verify a dedup hit (if requested). In this case, don't try to
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
* dedup (just take the already-allocated BP verbatim). Encrypted
|
|
|
|
* dedup blocks need data as well so we also disable dedup in this
|
|
|
|
* case.
|
2014-06-06 01:19:08 +04:00
|
|
|
*/
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (data == NULL &&
|
|
|
|
(zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
|
2014-06-06 01:19:08 +04:00
|
|
|
zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
uint64_t size, zio_done_func_t *done, void *private,
|
2014-06-25 22:37:59 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
|
2016-10-14 03:59:18 +03:00
|
|
|
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
void
|
2013-05-10 23:47:54 +04:00
|
|
|
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
|
|
|
|
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
|
|
|
* We must reset the io_prop to match the values that existed
|
|
|
|
* when the bp was first written by dmu_sync() keeping in mind
|
|
|
|
* that nopwrite and dedup are mutually exclusive.
|
|
|
|
*/
|
|
|
|
zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
|
|
|
|
zio->io_prop.zp_nopwrite = nopwrite;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_prop.zp_copies = copies;
|
|
|
|
zio->io_bp_override = bp;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
|
|
|
|
{
|
2014-06-06 01:19:08 +04:00
|
|
|
|
2020-02-11 01:00:05 +03:00
|
|
|
(void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT);
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
/*
|
|
|
|
* The check for EMBEDDED is a performance optimization. We
|
|
|
|
* process the free here (by ignoring it) rather than
|
|
|
|
* putting it on the list and then processing it in zio_free_sync().
|
|
|
|
*/
|
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return;
|
2013-09-04 16:00:57 +04:00
|
|
|
metaslab_check_free(spa, bp);
|
2013-07-03 20:13:38 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Frees that are for the currently-syncing txg, are not going to be
|
|
|
|
* deferred, and which will not need to do a read (i.e. not GANG or
|
|
|
|
* DEDUP), can be processed immediately. Otherwise, put them on the
|
|
|
|
* in-memory list for later processing.
|
Log Spacemap Project
= Motivation
At Delphix we've seen a lot of customer systems where fragmentation
is over 75% and random writes take a performance hit because a lot
of time is spend on I/Os that update on-disk space accounting metadata.
Specifically, we seen cases where 20% to 40% of sync time is spend
after sync pass 1 and ~30% of the I/Os on the system is spent updating
spacemaps.
The problem is that these pools have existed long enough that we've
touched almost every metaslab at least once, and random writes
scatter frees across all metaslabs every TXG, thus appending to
their spacemaps and resulting in many I/Os. To give an example,
assuming that every VDEV has 200 metaslabs and our writes fit within
a single spacemap block (generally 4K) we have 200 I/Os. Then if we
assume 2 levels of indirection, we need 400 additional I/Os and
since we are talking about metadata for which we keep 2 extra copies
for redundancy we need to triple that number, leading to a total of
1800 I/Os per VDEV every TXG.
We could try and decrease the number of metaslabs so we have less
I/Os per TXG but then each metaslab would cover a wider range on
disk and thus would take more time to be loaded in memory from disk.
In addition, after it's loaded, it's range tree would consume more
memory.
Another idea would be to just increase the spacemap block size
which would allow us to fit more entries within an I/O block
resulting in fewer I/Os per metaslab and a speedup in loading time.
The problem is still that we don't deal with the number of I/Os
going up as the number of metaslabs is increasing and the fact
is that we generally write a lot to a few metaslabs and a little
to the rest of them. Thus, just increasing the block size would
actually waste bandwidth because we won't be utilizing our bigger
block size.
= About this patch
This patch introduces the Log Spacemap project which provides the
solution to the above problem while taking into account all the
aforementioned tradeoffs. The details on how it achieves that can
be found in the references sections below and in the code (see
Big Theory Statement in spa_log_spacemap.c).
Even though the change is fairly constraint within the metaslab
and lower-level SPA codepaths, there is a side-change that is
user-facing. The change is that VDEV IDs from VDEV holes will no
longer be reused. To give some background and reasoning for this,
when a log device is removed and its VDEV structure was replaced
with a hole (or was compacted; if at the end of the vdev array),
its vdev_id could be reused by devices added after that. Now
with the pool-wide space maps recording the vdev ID, this behavior
can cause problems (e.g. is this entry referring to a segment in
the new vdev or the removed log?). Thus, to simplify things the
ID reuse behavior is gone and now vdev IDs for top-level vdevs
are truly unique within a pool.
= Testing
The illumos implementation of this feature has been used internally
for a year and has been in production for ~6 months. For this patch
specifically there don't seem to be any regressions introduced to
ZTS and I have been running zloop for a week without any related
problems.
= Performance Analysis (Linux Specific)
All performance results and analysis for illumos can be found in
the links of the references. Redoing the same experiments in Linux
gave similar results. Below are the specifics of the Linux run.
After the pool reached stable state the percentage of the time
spent in pass 1 per TXG was 64% on average for the stock bits
while the log spacemap bits stayed at 95% during the experiment
(graph: sdimitro.github.io/img/linux-lsm/PercOfSyncInPassOne.png).
Sync times per TXG were 37.6 seconds on average for the stock
bits and 22.7 seconds for the log spacemap bits (related graph:
sdimitro.github.io/img/linux-lsm/SyncTimePerTXG.png). As a result
the log spacemap bits were able to push more TXGs, which is also
the reason why all graphs quantified per TXG have more entries for
the log spacemap bits.
Another interesting aspect in terms of txg syncs is that the stock
bits had 22% of their TXGs reach sync pass 7, 55% reach sync pass 8,
and 20% reach 9. The log space map bits reached sync pass 4 in 79%
of their TXGs, sync pass 7 in 19%, and sync pass 8 at 1%. This
emphasizes the fact that not only we spend less time on metadata
but we also iterate less times to convergence in spa_sync() dirtying
objects.
[related graphs:
stock- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGStock.png
lsm- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGLSM.png]
Finally, the improvement in IOPs that the userland gains from the
change is approximately 40%. There is a consistent win in IOPS as
you can see from the graphs below but the absolute amount of
improvement that the log spacemap gives varies within each minute
interval.
sdimitro.github.io/img/linux-lsm/StockVsLog3Days.png
sdimitro.github.io/img/linux-lsm/StockVsLog10Hours.png
= Porting to Other Platforms
For people that want to port this commit to other platforms below
is a list of ZoL commits that this patch depends on:
Make zdb results for checkpoint tests consistent
db587941c5ff6dea01932bb78f70db63cf7f38ba
Update vdev_is_spacemap_addressable() for new spacemap encoding
419ba5914552c6185afbe1dd17b3ed4b0d526547
Simplify spa_sync by breaking it up to smaller functions
8dc2197b7b1e4d7ebc1420ea30e51c6541f1d834
Factor metaslab_load_wait() in metaslab_load()
b194fab0fb6caad18711abccaff3c69ad8b3f6d3
Rename range_tree_verify to range_tree_verify_not_present
df72b8bebe0ebac0b20e0750984bad182cb6564a
Change target size of metaslabs from 256GB to 16GB
c853f382db731e15a87512f4ef1101d14d778a55
zdb -L should skip leak detection altogether
21e7cf5da89f55ce98ec1115726b150e19eefe89
vs_alloc can underflow in L2ARC vdevs
7558997d2f808368867ca7e5234e5793446e8f3f
Simplify log vdev removal code
6c926f426a26ffb6d7d8e563e33fc176164175cb
Get rid of space_map_update() for ms_synced_length
425d3237ee88abc53d8522a7139c926d278b4b7f
Introduce auxiliary metaslab histograms
928e8ad47d3478a3d5d01f0dd6ae74a9371af65e
Error path in metaslab_load_impl() forgets to drop ms_sync_lock
8eef997679ba54547f7d361553d21b3291f41ae7
= References
Background, Motivation, and Internals of the Feature
- OpenZFS 2017 Presentation:
youtu.be/jj2IxRkl5bQ
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemaps-project
Flushing Algorithm Internals & Performance Results
(Illumos Specific)
- Blogpost:
sdimitro.github.io/post/zfs-lsm-flushing/
- OpenZFS 2018 Presentation:
youtu.be/x6D2dHRjkxw
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemap-flushing-algorithm
Upstream Delphix Issues:
DLPX-51539, DLPX-59659, DLPX-57783, DLPX-61438, DLPX-41227, DLPX-59320
DLPX-63385
Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #8442
2019-07-16 20:11:49 +03:00
|
|
|
*
|
|
|
|
* Note that we only defer frees after zfs_sync_pass_deferred_free
|
|
|
|
* when the log space map feature is disabled. [see relevant comment
|
|
|
|
* in spa_sync_iterate_to_convergence()]
|
2013-07-03 20:13:38 +04:00
|
|
|
*/
|
Log Spacemap Project
= Motivation
At Delphix we've seen a lot of customer systems where fragmentation
is over 75% and random writes take a performance hit because a lot
of time is spend on I/Os that update on-disk space accounting metadata.
Specifically, we seen cases where 20% to 40% of sync time is spend
after sync pass 1 and ~30% of the I/Os on the system is spent updating
spacemaps.
The problem is that these pools have existed long enough that we've
touched almost every metaslab at least once, and random writes
scatter frees across all metaslabs every TXG, thus appending to
their spacemaps and resulting in many I/Os. To give an example,
assuming that every VDEV has 200 metaslabs and our writes fit within
a single spacemap block (generally 4K) we have 200 I/Os. Then if we
assume 2 levels of indirection, we need 400 additional I/Os and
since we are talking about metadata for which we keep 2 extra copies
for redundancy we need to triple that number, leading to a total of
1800 I/Os per VDEV every TXG.
We could try and decrease the number of metaslabs so we have less
I/Os per TXG but then each metaslab would cover a wider range on
disk and thus would take more time to be loaded in memory from disk.
In addition, after it's loaded, it's range tree would consume more
memory.
Another idea would be to just increase the spacemap block size
which would allow us to fit more entries within an I/O block
resulting in fewer I/Os per metaslab and a speedup in loading time.
The problem is still that we don't deal with the number of I/Os
going up as the number of metaslabs is increasing and the fact
is that we generally write a lot to a few metaslabs and a little
to the rest of them. Thus, just increasing the block size would
actually waste bandwidth because we won't be utilizing our bigger
block size.
= About this patch
This patch introduces the Log Spacemap project which provides the
solution to the above problem while taking into account all the
aforementioned tradeoffs. The details on how it achieves that can
be found in the references sections below and in the code (see
Big Theory Statement in spa_log_spacemap.c).
Even though the change is fairly constraint within the metaslab
and lower-level SPA codepaths, there is a side-change that is
user-facing. The change is that VDEV IDs from VDEV holes will no
longer be reused. To give some background and reasoning for this,
when a log device is removed and its VDEV structure was replaced
with a hole (or was compacted; if at the end of the vdev array),
its vdev_id could be reused by devices added after that. Now
with the pool-wide space maps recording the vdev ID, this behavior
can cause problems (e.g. is this entry referring to a segment in
the new vdev or the removed log?). Thus, to simplify things the
ID reuse behavior is gone and now vdev IDs for top-level vdevs
are truly unique within a pool.
= Testing
The illumos implementation of this feature has been used internally
for a year and has been in production for ~6 months. For this patch
specifically there don't seem to be any regressions introduced to
ZTS and I have been running zloop for a week without any related
problems.
= Performance Analysis (Linux Specific)
All performance results and analysis for illumos can be found in
the links of the references. Redoing the same experiments in Linux
gave similar results. Below are the specifics of the Linux run.
After the pool reached stable state the percentage of the time
spent in pass 1 per TXG was 64% on average for the stock bits
while the log spacemap bits stayed at 95% during the experiment
(graph: sdimitro.github.io/img/linux-lsm/PercOfSyncInPassOne.png).
Sync times per TXG were 37.6 seconds on average for the stock
bits and 22.7 seconds for the log spacemap bits (related graph:
sdimitro.github.io/img/linux-lsm/SyncTimePerTXG.png). As a result
the log spacemap bits were able to push more TXGs, which is also
the reason why all graphs quantified per TXG have more entries for
the log spacemap bits.
Another interesting aspect in terms of txg syncs is that the stock
bits had 22% of their TXGs reach sync pass 7, 55% reach sync pass 8,
and 20% reach 9. The log space map bits reached sync pass 4 in 79%
of their TXGs, sync pass 7 in 19%, and sync pass 8 at 1%. This
emphasizes the fact that not only we spend less time on metadata
but we also iterate less times to convergence in spa_sync() dirtying
objects.
[related graphs:
stock- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGStock.png
lsm- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGLSM.png]
Finally, the improvement in IOPs that the userland gains from the
change is approximately 40%. There is a consistent win in IOPS as
you can see from the graphs below but the absolute amount of
improvement that the log spacemap gives varies within each minute
interval.
sdimitro.github.io/img/linux-lsm/StockVsLog3Days.png
sdimitro.github.io/img/linux-lsm/StockVsLog10Hours.png
= Porting to Other Platforms
For people that want to port this commit to other platforms below
is a list of ZoL commits that this patch depends on:
Make zdb results for checkpoint tests consistent
db587941c5ff6dea01932bb78f70db63cf7f38ba
Update vdev_is_spacemap_addressable() for new spacemap encoding
419ba5914552c6185afbe1dd17b3ed4b0d526547
Simplify spa_sync by breaking it up to smaller functions
8dc2197b7b1e4d7ebc1420ea30e51c6541f1d834
Factor metaslab_load_wait() in metaslab_load()
b194fab0fb6caad18711abccaff3c69ad8b3f6d3
Rename range_tree_verify to range_tree_verify_not_present
df72b8bebe0ebac0b20e0750984bad182cb6564a
Change target size of metaslabs from 256GB to 16GB
c853f382db731e15a87512f4ef1101d14d778a55
zdb -L should skip leak detection altogether
21e7cf5da89f55ce98ec1115726b150e19eefe89
vs_alloc can underflow in L2ARC vdevs
7558997d2f808368867ca7e5234e5793446e8f3f
Simplify log vdev removal code
6c926f426a26ffb6d7d8e563e33fc176164175cb
Get rid of space_map_update() for ms_synced_length
425d3237ee88abc53d8522a7139c926d278b4b7f
Introduce auxiliary metaslab histograms
928e8ad47d3478a3d5d01f0dd6ae74a9371af65e
Error path in metaslab_load_impl() forgets to drop ms_sync_lock
8eef997679ba54547f7d361553d21b3291f41ae7
= References
Background, Motivation, and Internals of the Feature
- OpenZFS 2017 Presentation:
youtu.be/jj2IxRkl5bQ
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemaps-project
Flushing Algorithm Internals & Performance Results
(Illumos Specific)
- Blogpost:
sdimitro.github.io/post/zfs-lsm-flushing/
- OpenZFS 2018 Presentation:
youtu.be/x6D2dHRjkxw
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemap-flushing-algorithm
Upstream Delphix Issues:
DLPX-51539, DLPX-59659, DLPX-57783, DLPX-61438, DLPX-41227, DLPX-59320
DLPX-63385
Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #8442
2019-07-16 20:11:49 +03:00
|
|
|
if (BP_IS_GANG(bp) ||
|
|
|
|
BP_GET_DEDUP(bp) ||
|
2013-07-03 20:13:38 +04:00
|
|
|
txg != spa->spa_syncing_txg ||
|
Log Spacemap Project
= Motivation
At Delphix we've seen a lot of customer systems where fragmentation
is over 75% and random writes take a performance hit because a lot
of time is spend on I/Os that update on-disk space accounting metadata.
Specifically, we seen cases where 20% to 40% of sync time is spend
after sync pass 1 and ~30% of the I/Os on the system is spent updating
spacemaps.
The problem is that these pools have existed long enough that we've
touched almost every metaslab at least once, and random writes
scatter frees across all metaslabs every TXG, thus appending to
their spacemaps and resulting in many I/Os. To give an example,
assuming that every VDEV has 200 metaslabs and our writes fit within
a single spacemap block (generally 4K) we have 200 I/Os. Then if we
assume 2 levels of indirection, we need 400 additional I/Os and
since we are talking about metadata for which we keep 2 extra copies
for redundancy we need to triple that number, leading to a total of
1800 I/Os per VDEV every TXG.
We could try and decrease the number of metaslabs so we have less
I/Os per TXG but then each metaslab would cover a wider range on
disk and thus would take more time to be loaded in memory from disk.
In addition, after it's loaded, it's range tree would consume more
memory.
Another idea would be to just increase the spacemap block size
which would allow us to fit more entries within an I/O block
resulting in fewer I/Os per metaslab and a speedup in loading time.
The problem is still that we don't deal with the number of I/Os
going up as the number of metaslabs is increasing and the fact
is that we generally write a lot to a few metaslabs and a little
to the rest of them. Thus, just increasing the block size would
actually waste bandwidth because we won't be utilizing our bigger
block size.
= About this patch
This patch introduces the Log Spacemap project which provides the
solution to the above problem while taking into account all the
aforementioned tradeoffs. The details on how it achieves that can
be found in the references sections below and in the code (see
Big Theory Statement in spa_log_spacemap.c).
Even though the change is fairly constraint within the metaslab
and lower-level SPA codepaths, there is a side-change that is
user-facing. The change is that VDEV IDs from VDEV holes will no
longer be reused. To give some background and reasoning for this,
when a log device is removed and its VDEV structure was replaced
with a hole (or was compacted; if at the end of the vdev array),
its vdev_id could be reused by devices added after that. Now
with the pool-wide space maps recording the vdev ID, this behavior
can cause problems (e.g. is this entry referring to a segment in
the new vdev or the removed log?). Thus, to simplify things the
ID reuse behavior is gone and now vdev IDs for top-level vdevs
are truly unique within a pool.
= Testing
The illumos implementation of this feature has been used internally
for a year and has been in production for ~6 months. For this patch
specifically there don't seem to be any regressions introduced to
ZTS and I have been running zloop for a week without any related
problems.
= Performance Analysis (Linux Specific)
All performance results and analysis for illumos can be found in
the links of the references. Redoing the same experiments in Linux
gave similar results. Below are the specifics of the Linux run.
After the pool reached stable state the percentage of the time
spent in pass 1 per TXG was 64% on average for the stock bits
while the log spacemap bits stayed at 95% during the experiment
(graph: sdimitro.github.io/img/linux-lsm/PercOfSyncInPassOne.png).
Sync times per TXG were 37.6 seconds on average for the stock
bits and 22.7 seconds for the log spacemap bits (related graph:
sdimitro.github.io/img/linux-lsm/SyncTimePerTXG.png). As a result
the log spacemap bits were able to push more TXGs, which is also
the reason why all graphs quantified per TXG have more entries for
the log spacemap bits.
Another interesting aspect in terms of txg syncs is that the stock
bits had 22% of their TXGs reach sync pass 7, 55% reach sync pass 8,
and 20% reach 9. The log space map bits reached sync pass 4 in 79%
of their TXGs, sync pass 7 in 19%, and sync pass 8 at 1%. This
emphasizes the fact that not only we spend less time on metadata
but we also iterate less times to convergence in spa_sync() dirtying
objects.
[related graphs:
stock- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGStock.png
lsm- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGLSM.png]
Finally, the improvement in IOPs that the userland gains from the
change is approximately 40%. There is a consistent win in IOPS as
you can see from the graphs below but the absolute amount of
improvement that the log spacemap gives varies within each minute
interval.
sdimitro.github.io/img/linux-lsm/StockVsLog3Days.png
sdimitro.github.io/img/linux-lsm/StockVsLog10Hours.png
= Porting to Other Platforms
For people that want to port this commit to other platforms below
is a list of ZoL commits that this patch depends on:
Make zdb results for checkpoint tests consistent
db587941c5ff6dea01932bb78f70db63cf7f38ba
Update vdev_is_spacemap_addressable() for new spacemap encoding
419ba5914552c6185afbe1dd17b3ed4b0d526547
Simplify spa_sync by breaking it up to smaller functions
8dc2197b7b1e4d7ebc1420ea30e51c6541f1d834
Factor metaslab_load_wait() in metaslab_load()
b194fab0fb6caad18711abccaff3c69ad8b3f6d3
Rename range_tree_verify to range_tree_verify_not_present
df72b8bebe0ebac0b20e0750984bad182cb6564a
Change target size of metaslabs from 256GB to 16GB
c853f382db731e15a87512f4ef1101d14d778a55
zdb -L should skip leak detection altogether
21e7cf5da89f55ce98ec1115726b150e19eefe89
vs_alloc can underflow in L2ARC vdevs
7558997d2f808368867ca7e5234e5793446e8f3f
Simplify log vdev removal code
6c926f426a26ffb6d7d8e563e33fc176164175cb
Get rid of space_map_update() for ms_synced_length
425d3237ee88abc53d8522a7139c926d278b4b7f
Introduce auxiliary metaslab histograms
928e8ad47d3478a3d5d01f0dd6ae74a9371af65e
Error path in metaslab_load_impl() forgets to drop ms_sync_lock
8eef997679ba54547f7d361553d21b3291f41ae7
= References
Background, Motivation, and Internals of the Feature
- OpenZFS 2017 Presentation:
youtu.be/jj2IxRkl5bQ
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemaps-project
Flushing Algorithm Internals & Performance Results
(Illumos Specific)
- Blogpost:
sdimitro.github.io/post/zfs-lsm-flushing/
- OpenZFS 2018 Presentation:
youtu.be/x6D2dHRjkxw
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemap-flushing-algorithm
Upstream Delphix Issues:
DLPX-51539, DLPX-59659, DLPX-57783, DLPX-61438, DLPX-41227, DLPX-59320
DLPX-63385
Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #8442
2019-07-16 20:11:49 +03:00
|
|
|
(spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
|
|
|
|
!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) {
|
2013-07-03 20:13:38 +04:00
|
|
|
bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
|
|
|
|
} else {
|
2020-02-29 01:49:44 +03:00
|
|
|
VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL);
|
2013-07-03 20:13:38 +04:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2020-02-29 01:49:44 +03:00
|
|
|
/*
|
|
|
|
* To improve performance, this function may return NULL if we were able
|
|
|
|
* to do the free immediately. This avoids the cost of creating a zio
|
|
|
|
* (and linking it to the parent, etc).
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|
|
|
enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(!BP_IS_HOLE(bp));
|
|
|
|
ASSERT(spa_syncing_txg(spa) == txg);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp))
|
2020-02-29 01:49:44 +03:00
|
|
|
return (NULL);
|
2014-06-06 01:19:08 +04:00
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
metaslab_check_free(spa, bp);
|
2013-10-07 15:30:22 +04:00
|
|
|
arc_freed(spa, bp);
|
2017-11-16 04:27:01 +03:00
|
|
|
dsl_scan_freed(spa, bp);
|
2013-09-04 16:00:57 +04:00
|
|
|
|
2020-02-29 01:49:44 +03:00
|
|
|
if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) {
|
|
|
|
/*
|
|
|
|
* GANG and DEDUP blocks can induce a read (for the gang block
|
|
|
|
* header, or the DDT), so issue them asynchronously so that
|
|
|
|
* this thread is not tied up.
|
|
|
|
*/
|
|
|
|
enum zio_stage stage =
|
|
|
|
ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
|
2013-07-03 20:13:38 +04:00
|
|
|
|
2020-02-29 01:49:44 +03:00
|
|
|
return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
|
|
|
|
BP_GET_PSIZE(bp), NULL, NULL,
|
|
|
|
ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
|
|
|
|
flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage));
|
|
|
|
} else {
|
|
|
|
metaslab_free(spa, bp, txg, B_FALSE);
|
|
|
|
return (NULL);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|
|
|
zio_done_func_t *done, void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2020-02-11 01:00:05 +03:00
|
|
|
(void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
|
|
|
|
BLK_VERIFY_HALT);
|
2014-06-06 01:19:08 +04:00
|
|
|
|
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* A claim is an allocation of a specific block. Claims are needed
|
|
|
|
* to support immediate writes in the intent log. The issue is that
|
|
|
|
* immediate writes contain committed data, but in a txg that was
|
|
|
|
* *not* committed. Upon opening the pool after an unclean shutdown,
|
|
|
|
* the intent log claims all blocks that contain immediate write data
|
|
|
|
* so that the SPA knows they're in use.
|
|
|
|
*
|
|
|
|
* All claims *must* be resolved in the first txg -- before the SPA
|
|
|
|
* starts allocating blocks -- so that nothing is allocated twice.
|
2010-05-29 00:45:14 +04:00
|
|
|
* If txg == 0 we just verify that the block is claimable.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2016-12-17 01:11:29 +03:00
|
|
|
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
|
|
|
|
spa_min_claim_txg(spa));
|
|
|
|
ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
|
2020-10-30 18:55:59 +03:00
|
|
|
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
|
2016-07-11 20:45:52 +03:00
|
|
|
BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
|
|
|
|
flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT0(zio->io_queued_timestamp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_done_func_t *done, void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
int c;
|
|
|
|
|
|
|
|
if (vd->vdev_children == 0) {
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
|
|
|
|
|
|
|
|
zio->io_cmd = cmd;
|
|
|
|
} else {
|
2009-02-18 23:51:31 +03:00
|
|
|
zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
for (c = 0; c < vd->vdev_children; c++)
|
|
|
|
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
done, private, flags));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
zio_t *
|
|
|
|
zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
|
|
|
|
zio_done_func_t *done, void *private, zio_priority_t priority,
|
|
|
|
enum zio_flag flags, enum trim_flag trim_flags)
|
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
ASSERT0(vd->vdev_children);
|
|
|
|
ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
|
|
|
|
ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
|
|
|
|
ASSERT3U(size, !=, 0);
|
|
|
|
|
|
|
|
zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done,
|
|
|
|
private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
|
|
|
|
vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
|
|
|
|
zio->io_trim_flags = trim_flags;
|
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_t *
|
|
|
|
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, int checksum, zio_done_func_t *done, void *private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(vd->vdev_children == 0);
|
|
|
|
ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
|
|
|
|
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
|
|
|
|
ASSERT3U(offset + size, <=, vd->vdev_psize);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
|
|
|
|
private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
|
|
|
|
offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_prop.zp_checksum = checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, int checksum, zio_done_func_t *done, void *private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(vd->vdev_children == 0);
|
|
|
|
ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
|
|
|
|
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
|
|
|
|
ASSERT3U(offset + size, <=, vd->vdev_psize);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
|
|
|
|
private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
|
|
|
|
offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_prop.zp_checksum = checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-06-16 01:47:05 +03:00
|
|
|
if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* zec checksums are necessarily destructive -- they modify
|
2008-12-03 23:09:06 +03:00
|
|
|
* the end of the write buffer to hold the verifier/checksum.
|
2008-11-20 23:01:55 +03:00
|
|
|
* Therefore, we must make a local copy in case the data is
|
2008-12-03 23:09:06 +03:00
|
|
|
* being written to multiple places in parallel.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *wbuf = abd_alloc_sametype(data, size);
|
|
|
|
abd_copy(wbuf, data, size);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_push_transform(zio, wbuf, size, size, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Create a child I/O to do some work for us.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
|
2017-01-21 00:17:55 +03:00
|
|
|
abd_t *data, uint64_t size, int type, zio_priority_t priority,
|
|
|
|
enum zio_flag flags, zio_done_func_t *done, void *private)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
/*
|
|
|
|
* vdev child I/Os do not propagate their error to the parent.
|
|
|
|
* Therefore, for correct operation the caller *must* check for
|
|
|
|
* and handle the error in the child i/o's done callback.
|
|
|
|
* The only exceptions are i/os that we don't care about
|
|
|
|
* (OPTIONAL or REPAIR).
|
|
|
|
*/
|
|
|
|
ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
|
|
|
|
done != NULL);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (type == ZIO_TYPE_READ && bp != NULL) {
|
|
|
|
/*
|
|
|
|
* If we have the bp, then the child should perform the
|
|
|
|
* checksum and the parent need not. This pushes error
|
|
|
|
* detection as close to the leaves as possible and
|
|
|
|
* eliminates redundant checksums in the interior nodes.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
|
|
|
|
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
if (vd->vdev_ops->vdev_op_leaf) {
|
|
|
|
ASSERT0(vd->vdev_children);
|
2008-12-03 23:09:06 +03:00
|
|
|
offset += VDEV_LABEL_START_SIZE;
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
flags |= ZIO_VDEV_CHILD_FLAGS(pio);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we've decided to do a repair, the write is not speculative --
|
|
|
|
* even if the original read was.
|
|
|
|
*/
|
|
|
|
if (flags & ZIO_FLAG_IO_REPAIR)
|
|
|
|
flags &= ~ZIO_FLAG_SPECULATIVE;
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* If we're creating a child I/O that is not associated with a
|
|
|
|
* top-level vdev, then the child zio is not an allocating I/O.
|
|
|
|
* If this is a retried I/O then we ignore it since we will
|
|
|
|
* have already processed the original allocating I/O.
|
|
|
|
*/
|
|
|
|
if (flags & ZIO_FLAG_IO_ALLOCATING &&
|
|
|
|
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
|
2018-09-06 04:33:36 +03:00
|
|
|
ASSERT(pio->io_metaslab_class != NULL);
|
|
|
|
ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT(type == ZIO_TYPE_WRITE);
|
|
|
|
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
|
|
|
|
ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
|
|
|
|
pio->io_child_type == ZIO_CHILD_GANG);
|
|
|
|
|
|
|
|
flags &= ~ZIO_FLAG_IO_ALLOCATING;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
|
2010-05-29 00:45:14 +04:00
|
|
|
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
|
|
|
|
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio->io_physdone = pio->io_physdone;
|
|
|
|
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
|
|
|
|
zio->io_logical->io_phys_children++;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
|
OpenZFS 9290 - device removal reduces redundancy of mirrors
Mirrors are supposed to provide redundancy in the face of whole-disk
failure and silent damage (e.g. some data on disk is not right, but ZFS
hasn't detected the whole device as being broken). However, the current
device removal implementation bypasses some of the mirror's redundancy.
Note that in no case is incorrect data returned, but we might get a
checksum error when we should have been able to find the right data.
There are two underlying problems:
1. When we remove a mirror device, we only read one side of the mirror.
Since we can't verify the checksum, this side may be silently bad, but
the good data is on the other side of the mirror (which we didn't read).
This can cause the removal to "bake in" the busted data – all copies of
the data in the new location are the same, busted version, while we left
the good version behind.
The fix for this is to read and copy both sides of the mirror. If the
old and new vdevs are mirrors, we will read both sides of the old
mirror, and write each copy to the corresponding side of the new mirror.
(If the old and new vdevs have a different number of children, we will
do this as best as possible.) Even though we aren't verifying checksums,
this ensures that as long as there's a good copy of the data, we'll have
a good copy after the removal, even if there's silent damage to one side
of the mirror. If we're removing a mirror that has some silent damage,
we'll have exactly the same damage in the new location (assuming that
the new location is also a mirror).
2. When we read from an indirect vdev that points to a mirror vdev, we
only consider one copy of the data. This can lead to reduced effective
redundancy, because we might read a bad copy of the data from one side
of the mirror, and not retry the other, good side of the mirror.
Note that the problem is not with the removal process, but rather after
the removal has completed (having copied correct data to both sides of
the mirror), if one side of the new mirror is silently damaged, we
encounter the problem when reading the relocated data via the indirect
vdev. Also note that the problem doesn't occur when ZFS knows that one
side of the mirror is bad, e.g. when a disk entirely fails or is
offlined.
The impact is that reads (from indirect vdevs that point to mirrors) may
return a checksum error even though the good data exists on one side of
the mirror, and scrub doesn't repair all data on the mirror (if some of
it is pointed to via an indirect vdev).
The fix for this is complicated by "split blocks" - one logical block
may be split into two (or more) pieces with each piece moved to a
different new location. In this case we need to read all versions of
each split (one from each side of the mirror), and figure out which
combination of versions results in the correct checksum, and then repair
the incorrect versions.
This ensures that we supply the same redundancy whether you use device
removal or not. For example, if a mirror has small silent errors on all
of its children, we can still reconstruct the correct data, as long as
those errors are at sufficiently-separated offsets (specifically,
separated by the largest block size - default of 128KB, but up to 16MB).
Porting notes:
* A new indirect vdev check was moved from dsl_scan_needs_resilver_cb()
to dsl_scan_needs_resilver(), which was added to ZoL as part of the
sequential scrub work.
* Passed NULL for zfs_ereport_post_checksum()'s zbookmark_phys_t
parameter. The extra parameter is unique to ZoL.
* When posting indirect checksum errors the ABD can be passed directly,
zfs_ereport_post_checksum() is not yet ABD-aware in OpenZFS.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9290
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/591
Closes #6900
2018-02-13 22:37:56 +03:00
|
|
|
zio_type_t type, zio_priority_t priority, enum zio_flag flags,
|
2017-01-12 20:42:11 +03:00
|
|
|
zio_done_func_t *done, void *private)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(vd->vdev_ops->vdev_op_leaf);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
|
2016-07-11 20:45:52 +03:00
|
|
|
data, size, size, done, private, type, priority,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
|
2008-12-03 23:09:06 +03:00
|
|
|
vd, offset, NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_flush(zio_t *zio, vdev_t *vd)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
NULL, NULL,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
void
|
|
|
|
zio_shrink(zio_t *zio, uint64_t size)
|
|
|
|
{
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3P(zio->io_executor, ==, NULL);
|
|
|
|
ASSERT3U(zio->io_orig_size, ==, zio->io_size);
|
|
|
|
ASSERT3U(size, <=, zio->io_size);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't shrink for raidz because of problems with the
|
|
|
|
* reconstruction when reading back less than the block size.
|
|
|
|
* Note, BP_IS_RAIDZ() assumes no compression.
|
|
|
|
*/
|
|
|
|
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
|
2016-07-11 20:45:52 +03:00
|
|
|
if (!BP_IS_RAIDZ(zio->io_bp)) {
|
|
|
|
/* we are not doing a raw write */
|
|
|
|
ASSERT3U(zio->io_size, ==, zio->io_lsize);
|
|
|
|
zio->io_orig_size = zio->io_size = zio->io_lsize = size;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* Prepare to read and write logical blocks
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_read_bp_init(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
uint64_t psize =
|
|
|
|
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
|
|
|
|
|
2009-01-16 00:59:39 +03:00
|
|
|
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
|
2009-07-03 02:44:48 +04:00
|
|
|
zio->io_child_type == ZIO_CHILD_LOGICAL &&
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
|
|
|
|
psize, psize, zio_decompress);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
|
|
|
|
BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
|
|
|
|
zio->io_child_type == ZIO_CHILD_LOGICAL) {
|
|
|
|
zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
|
|
|
|
psize, psize, zio_decrypt);
|
|
|
|
}
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
|
2016-07-22 18:52:49 +03:00
|
|
|
int psize = BPE_GET_PSIZE(bp);
|
|
|
|
void *data = abd_borrow_buf(zio->io_abd, psize);
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
2016-07-22 18:52:49 +03:00
|
|
|
decode_embedded_bp_compressed(bp, data);
|
|
|
|
abd_return_buf_copy(zio->io_abd, data, psize);
|
2014-06-06 01:19:08 +04:00
|
|
|
} else {
|
|
|
|
ASSERT(!BP_IS_EMBEDDED(bp));
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
|
2014-06-06 01:19:08 +04:00
|
|
|
}
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
|
|
|
|
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
|
|
|
|
|
|
|
|
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
|
|
|
|
zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write_bp_init(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
if (!IO_IS_ALLOCATING(zio))
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
|
|
|
|
|
|
|
|
if (zio->io_bp_override) {
|
2016-10-14 03:59:18 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(bp->blk_birth != zio->io_txg);
|
|
|
|
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
|
|
|
|
|
|
|
|
*bp = *zio->io_bp_override;
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp))
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2014-06-06 01:19:08 +04:00
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
|
|
|
* If we've been overridden and nopwrite is set then
|
|
|
|
* set the flag accordingly to indicate that a nopwrite
|
|
|
|
* has already occurred.
|
|
|
|
*/
|
|
|
|
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
|
|
|
|
ASSERT(!zp->zp_dedup);
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
|
2013-05-10 23:47:54 +04:00
|
|
|
zio->io_flags |= ZIO_FLAG_NOPWRITE;
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2013-05-10 23:47:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(!zp->zp_nopwrite);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2016-06-16 01:47:05 +03:00
|
|
|
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
|
|
|
|
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
|
|
|
|
!zp->zp_encrypt) {
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_DEDUP(bp, 1);
|
|
|
|
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We were unable to handle this as an override bp, treat
|
|
|
|
* it as a regular write I/O.
|
|
|
|
*/
|
2015-11-04 23:19:17 +03:00
|
|
|
zio->io_bp_override = NULL;
|
2016-10-14 03:59:18 +03:00
|
|
|
*bp = zio->io_bp_orig;
|
|
|
|
zio->io_pipeline = zio->io_orig_pipeline;
|
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_write_compress(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
enum zio_compress compress = zp->zp_compress;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
uint64_t lsize = zio->io_lsize;
|
|
|
|
uint64_t psize = zio->io_size;
|
|
|
|
int pass = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If our children haven't all reached the ready stage,
|
|
|
|
* wait for them and then repeat this pipeline stage.
|
|
|
|
*/
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
|
|
|
|
ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
if (!IO_IS_ALLOCATING(zio))
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
if (zio->io_children_ready != NULL) {
|
|
|
|
/*
|
|
|
|
* Now that all our children are ready, run the callback
|
|
|
|
* associated with this zio in case it wants to modify the
|
|
|
|
* data to be written.
|
|
|
|
*/
|
|
|
|
ASSERT3U(zp->zp_level, >, 0);
|
|
|
|
zio->io_children_ready(zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
|
|
|
|
ASSERT(zio->io_bp_override == NULL);
|
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* We're rewriting an existing block, which means we're
|
|
|
|
* working on behalf of spa_sync(). For spa_sync() to
|
|
|
|
* converge, it must eventually be the case that we don't
|
|
|
|
* have to allocate new blocks. But compression changes
|
|
|
|
* the blocksize, which forces a reallocate, and makes
|
|
|
|
* convergence take longer. Therefore, after the first
|
|
|
|
* few passes, stop compressing to ensure convergence.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
pass = spa_sync_pass(spa);
|
|
|
|
|
|
|
|
ASSERT(zio->io_txg == spa_syncing_txg(spa));
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(!BP_GET_DEDUP(bp));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-05-06 21:14:52 +04:00
|
|
|
if (pass >= zfs_sync_pass_dont_compress)
|
2008-12-03 23:09:06 +03:00
|
|
|
compress = ZIO_COMPRESS_OFF;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* Make sure someone doesn't change their mind on overwrites */
|
2014-06-06 01:19:08 +04:00
|
|
|
ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
/* If it's a compressed write that is not raw, compress the buffer. */
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (compress != ZIO_COMPRESS_OFF &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
void *cbuf = zio_buf_alloc(lsize);
|
Add zstd support to zfs
This PR adds two new compression types, based on ZStandard:
- zstd: A basic ZStandard compression algorithm Available compression.
Levels for zstd are zstd-1 through zstd-19, where the compression
increases with every level, but speed decreases.
- zstd-fast: A faster version of the ZStandard compression algorithm
zstd-fast is basically a "negative" level of zstd. The compression
decreases with every level, but speed increases.
Available compression levels for zstd-fast:
- zstd-fast-1 through zstd-fast-10
- zstd-fast-20 through zstd-fast-100 (in increments of 10)
- zstd-fast-500 and zstd-fast-1000
For more information check the man page.
Implementation details:
Rather than treat each level of zstd as a different algorithm (as was
done historically with gzip), the block pointer `enum zio_compress`
value is simply zstd for all levels, including zstd-fast, since they all
use the same decompression function.
The compress= property (a 64bit unsigned integer) uses the lower 7 bits
to store the compression algorithm (matching the number of bits used in
a block pointer, as the 8th bit was borrowed for embedded block
pointers). The upper bits are used to store the compression level.
It is necessary to be able to determine what compression level was used
when later reading a block back, so the concept used in LZ4, where the
first 32bits of the on-disk value are the size of the compressed data
(since the allocation is rounded up to the nearest ashift), was
extended, and we store the version of ZSTD and the level as well as the
compressed size. This value is returned when decompressing a block, so
that if the block needs to be recompressed (L2ARC, nop-write, etc), that
the same parameters will be used to result in the matching checksum.
All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`,
`zio_prop_t`, etc.) uses the separated _compress and _complevel
variables. Only the properties ZAP contains the combined/bit-shifted
value. The combined value is split when the compression_changed_cb()
callback is called, and sets both objset members (os_compress and
os_complevel).
The userspace tools all use the combined/bit-shifted value.
Additional notes:
zdb can now also decode the ZSTD compression header (flag -Z) and
inspect the size, version and compression level saved in that header.
For each record, if it is ZSTD compressed, the parameters of the decoded
compression header get printed.
ZSTD is included with all current tests and new tests are added
as-needed.
Per-dataset feature flags now get activated when the property is set.
If a compression algorithm requires a feature flag, zfs activates the
feature when the property is set, rather than waiting for the first
block to be born. This is currently only used by zstd but can be
extended as needed.
Portions-Sponsored-By: The FreeBSD Foundation
Co-authored-by: Allan Jude <allanjude@freebsd.org>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Co-authored-by: Michael Niewöhner <foss@mniewoehner.de>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #6247
Closes #9024
Closes #10277
Closes #10278
2020-08-18 20:10:17 +03:00
|
|
|
psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize,
|
|
|
|
zp->zp_complevel);
|
|
|
|
if (psize == 0 || psize >= lsize) {
|
2008-12-03 23:09:06 +03:00
|
|
|
compress = ZIO_COMPRESS_OFF;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_buf_free(cbuf, lsize);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
} else if (!zp->zp_dedup && !zp->zp_encrypt &&
|
|
|
|
psize <= BPE_PAYLOAD_SIZE &&
|
2014-06-06 01:19:08 +04:00
|
|
|
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
|
|
|
|
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
|
|
|
|
encode_embedded_bp_compressed(bp,
|
|
|
|
cbuf, compress, lsize, psize);
|
|
|
|
BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
|
|
|
|
BP_SET_TYPE(bp, zio->io_prop.zp_type);
|
|
|
|
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
|
|
|
|
zio_buf_free(cbuf, lsize);
|
|
|
|
bp->blk_birth = zio->io_txg;
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
ASSERT(spa_feature_is_active(spa,
|
|
|
|
SPA_FEATURE_EMBEDDED_DATA));
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else {
|
2014-06-06 01:19:08 +04:00
|
|
|
/*
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
* Round compressed size up to the minimum allocation
|
|
|
|
* size of the smallest-ashift device, and zero the
|
|
|
|
* tail. This ensures that the compressed size of the
|
|
|
|
* BP (and thus compressratio property) are correct,
|
2015-05-20 07:14:01 +03:00
|
|
|
* in that we charge for the padding used to fill out
|
|
|
|
* the last sector.
|
2014-06-06 01:19:08 +04:00
|
|
|
*/
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
|
|
|
|
size_t rounded = (size_t)roundup(psize,
|
|
|
|
spa->spa_min_alloc);
|
2015-05-20 07:14:01 +03:00
|
|
|
if (rounded >= lsize) {
|
2014-06-06 01:19:08 +04:00
|
|
|
compress = ZIO_COMPRESS_OFF;
|
|
|
|
zio_buf_free(cbuf, lsize);
|
2015-05-20 07:14:01 +03:00
|
|
|
psize = lsize;
|
2014-06-06 01:19:08 +04:00
|
|
|
} else {
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *cdata = abd_get_from_buf(cbuf, lsize);
|
|
|
|
abd_take_ownership_of_buf(cdata, B_TRUE);
|
|
|
|
abd_zero_off(cdata, psize, rounded - psize);
|
2015-05-20 07:14:01 +03:00
|
|
|
psize = rounded;
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_push_transform(zio, cdata,
|
2014-06-06 01:19:08 +04:00
|
|
|
psize, lsize, NULL);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We were unable to handle this as an override bp, treat
|
|
|
|
* it as a regular write I/O.
|
|
|
|
*/
|
|
|
|
zio->io_bp_override = NULL;
|
|
|
|
*bp = zio->io_bp_orig;
|
|
|
|
zio->io_pipeline = zio->io_orig_pipeline;
|
|
|
|
|
2018-02-21 23:28:52 +03:00
|
|
|
} else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 &&
|
|
|
|
zp->zp_type == DMU_OT_DNODE) {
|
|
|
|
/*
|
|
|
|
* The DMU actually relies on the zio layer's compression
|
|
|
|
* to free metadnode blocks that have had all contained
|
|
|
|
* dnodes freed. As a result, even when doing a raw
|
|
|
|
* receive, we must check whether the block can be compressed
|
|
|
|
* to a hole.
|
|
|
|
*/
|
|
|
|
psize = zio_compress_data(ZIO_COMPRESS_EMPTY,
|
Add zstd support to zfs
This PR adds two new compression types, based on ZStandard:
- zstd: A basic ZStandard compression algorithm Available compression.
Levels for zstd are zstd-1 through zstd-19, where the compression
increases with every level, but speed decreases.
- zstd-fast: A faster version of the ZStandard compression algorithm
zstd-fast is basically a "negative" level of zstd. The compression
decreases with every level, but speed increases.
Available compression levels for zstd-fast:
- zstd-fast-1 through zstd-fast-10
- zstd-fast-20 through zstd-fast-100 (in increments of 10)
- zstd-fast-500 and zstd-fast-1000
For more information check the man page.
Implementation details:
Rather than treat each level of zstd as a different algorithm (as was
done historically with gzip), the block pointer `enum zio_compress`
value is simply zstd for all levels, including zstd-fast, since they all
use the same decompression function.
The compress= property (a 64bit unsigned integer) uses the lower 7 bits
to store the compression algorithm (matching the number of bits used in
a block pointer, as the 8th bit was borrowed for embedded block
pointers). The upper bits are used to store the compression level.
It is necessary to be able to determine what compression level was used
when later reading a block back, so the concept used in LZ4, where the
first 32bits of the on-disk value are the size of the compressed data
(since the allocation is rounded up to the nearest ashift), was
extended, and we store the version of ZSTD and the level as well as the
compressed size. This value is returned when decompressing a block, so
that if the block needs to be recompressed (L2ARC, nop-write, etc), that
the same parameters will be used to result in the matching checksum.
All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`,
`zio_prop_t`, etc.) uses the separated _compress and _complevel
variables. Only the properties ZAP contains the combined/bit-shifted
value. The combined value is split when the compression_changed_cb()
callback is called, and sets both objset members (os_compress and
os_complevel).
The userspace tools all use the combined/bit-shifted value.
Additional notes:
zdb can now also decode the ZSTD compression header (flag -Z) and
inspect the size, version and compression level saved in that header.
For each record, if it is ZSTD compressed, the parameters of the decoded
compression header get printed.
ZSTD is included with all current tests and new tests are added
as-needed.
Per-dataset feature flags now get activated when the property is set.
If a compression algorithm requires a feature flag, zfs activates the
feature when the property is set, rather than waiting for the first
block to be born. This is currently only used by zstd but can be
extended as needed.
Portions-Sponsored-By: The FreeBSD Foundation
Co-authored-by: Allan Jude <allanjude@freebsd.org>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Co-authored-by: Michael Niewöhner <foss@mniewoehner.de>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #6247
Closes #9024
Closes #10277
Closes #10278
2020-08-18 20:10:17 +03:00
|
|
|
zio->io_abd, NULL, lsize, zp->zp_complevel);
|
|
|
|
if (psize == 0 || psize >= lsize)
|
2018-02-21 23:28:52 +03:00
|
|
|
compress = ZIO_COMPRESS_OFF;
|
2016-07-11 20:45:52 +03:00
|
|
|
} else {
|
|
|
|
ASSERT3U(psize, !=, 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* The final pass of spa_sync() must be all rewrites, but the first
|
|
|
|
* few passes offer a trade-off: allocating blocks defers convergence,
|
|
|
|
* but newly allocated blocks are sequential, so they can be written
|
|
|
|
* to disk faster. Therefore, we allow the first few passes of
|
|
|
|
* spa_sync() to allocate new blocks, but force rewrites after that.
|
|
|
|
* There should only be a handful of blocks after pass 1 in any case.
|
|
|
|
*/
|
2013-12-09 22:37:51 +04:00
|
|
|
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
|
|
|
|
BP_GET_PSIZE(bp) == psize &&
|
2013-05-06 21:14:52 +04:00
|
|
|
pass >= zfs_sync_pass_rewrite) {
|
2018-09-06 04:33:36 +03:00
|
|
|
VERIFY3U(psize, !=, 0);
|
2017-11-04 23:25:13 +03:00
|
|
|
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
|
2018-09-06 04:33:36 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
|
|
|
|
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
|
|
|
|
} else {
|
|
|
|
BP_ZERO(bp);
|
|
|
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (psize == 0) {
|
2013-12-09 22:37:51 +04:00
|
|
|
if (zio->io_bp_orig.blk_birth != 0 &&
|
|
|
|
spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
|
|
|
|
BP_SET_LSIZE(bp, lsize);
|
|
|
|
BP_SET_TYPE(bp, zp->zp_type);
|
|
|
|
BP_SET_LEVEL(bp, zp->zp_level);
|
|
|
|
BP_SET_BIRTH(bp, zio->io_txg, 0);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
} else {
|
|
|
|
ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
|
|
|
|
BP_SET_LSIZE(bp, lsize);
|
2013-12-09 22:37:51 +04:00
|
|
|
BP_SET_TYPE(bp, zp->zp_type);
|
|
|
|
BP_SET_LEVEL(bp, zp->zp_level);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_PSIZE(bp, psize);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_COMPRESS(bp, compress);
|
|
|
|
BP_SET_CHECKSUM(bp, zp->zp_checksum);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_DEDUP(bp, zp->zp_dedup);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zp->zp_dedup) {
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
ASSERT(!zp->zp_encrypt ||
|
|
|
|
DMU_OT_IS_ENCRYPTED(zp->zp_type));
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
|
|
|
|
}
|
2013-05-10 23:47:54 +04:00
|
|
|
if (zp->zp_nopwrite) {
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
|
|
|
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free_bp_init(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
|
|
|
|
if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
|
|
|
|
if (BP_GET_DEDUP(bp))
|
|
|
|
zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Execute the I/O pipeline
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void
|
2013-05-06 23:24:30 +04:00
|
|
|
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_t *spa = zio->io_spa;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_type_t t = zio->io_type;
|
2011-11-08 04:26:52 +04:00
|
|
|
int flags = (cutinline ? TQ_FRONT : 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2009-07-03 02:44:48 +04:00
|
|
|
* If we're a config writer or a probe, the normal issue and
|
|
|
|
* interrupt threads may all be blocked waiting for the config lock.
|
|
|
|
* In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
|
2008-12-03 23:09:06 +03:00
|
|
|
t = ZIO_TYPE_NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* A similar issue exists for the L2ARC write thread until L2ARC 2.0.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
|
|
|
|
t = ZIO_TYPE_NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
2013-05-06 23:24:30 +04:00
|
|
|
* If this is a high priority I/O, then use the high priority taskq if
|
|
|
|
* available.
|
2010-05-29 00:45:14 +04:00
|
|
|
*/
|
2018-11-03 02:49:40 +03:00
|
|
|
if ((zio->io_priority == ZIO_PRIORITY_NOW ||
|
|
|
|
zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
|
2013-05-06 23:24:30 +04:00
|
|
|
spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
|
2010-05-29 00:45:14 +04:00
|
|
|
q++;
|
|
|
|
|
|
|
|
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
|
2010-08-26 21:32:23 +04:00
|
|
|
|
2011-11-08 04:26:52 +04:00
|
|
|
/*
|
|
|
|
* NB: We are assuming that the zio can only be dispatched
|
|
|
|
* to a single taskq at a time. It would be a grievous error
|
|
|
|
* to dispatch the zio to another taskq at the same time.
|
|
|
|
*/
|
|
|
|
ASSERT(taskq_empty_ent(&zio->io_tqent));
|
2013-05-06 23:24:30 +04:00
|
|
|
spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
|
|
|
|
flags, &zio->io_tqent);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static boolean_t
|
2013-05-06 23:24:30 +04:00
|
|
|
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2020-03-03 21:29:38 +03:00
|
|
|
taskq_t *tq = taskq_of_curthread();
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
|
2013-05-06 23:24:30 +04:00
|
|
|
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
|
|
|
|
uint_t i;
|
|
|
|
for (i = 0; i < tqs->stqs_count; i++) {
|
2020-03-03 21:29:38 +03:00
|
|
|
if (tqs->stqs_taskq[i] == tq)
|
2013-05-06 23:24:30 +04:00
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (B_FALSE);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_issue_async(zio_t *zio)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_interrupt(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2020-01-22 02:04:13 +03:00
|
|
|
void
|
|
|
|
zio_delay_interrupt(zio_t *zio)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The timeout_generic() function isn't defined in userspace, so
|
|
|
|
* rather than trying to implement the function, the zio delay
|
|
|
|
* functionality has been disabled for userspace builds.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifdef _KERNEL
|
|
|
|
/*
|
|
|
|
* If io_target_timestamp is zero, then no delay has been registered
|
|
|
|
* for this IO, thus jump to the end of this function and "skip" the
|
|
|
|
* delay; issuing it directly to the zio layer.
|
|
|
|
*/
|
|
|
|
if (zio->io_target_timestamp != 0) {
|
|
|
|
hrtime_t now = gethrtime();
|
|
|
|
|
|
|
|
if (now >= zio->io_target_timestamp) {
|
|
|
|
/*
|
|
|
|
* This IO has already taken longer than the target
|
|
|
|
* delay to complete, so we don't want to delay it
|
|
|
|
* any longer; we "miss" the delay and issue it
|
|
|
|
* directly to the zio layer. This is likely due to
|
|
|
|
* the target latency being set to a value less than
|
|
|
|
* the underlying hardware can satisfy (e.g. delay
|
|
|
|
* set to 1ms, but the disks take 10ms to complete an
|
|
|
|
* IO request).
|
|
|
|
*/
|
|
|
|
|
|
|
|
DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
|
|
|
|
hrtime_t, now);
|
|
|
|
|
|
|
|
zio_interrupt(zio);
|
|
|
|
} else {
|
|
|
|
taskqid_t tid;
|
|
|
|
hrtime_t diff = zio->io_target_timestamp - now;
|
|
|
|
clock_t expire_at_tick = ddi_get_lbolt() +
|
|
|
|
NSEC_TO_TICK(diff);
|
|
|
|
|
|
|
|
DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
|
|
|
|
hrtime_t, now, hrtime_t, diff);
|
|
|
|
|
|
|
|
if (NSEC_TO_TICK(diff) == 0) {
|
|
|
|
/* Our delay is less than a jiffy - just spin */
|
|
|
|
zfs_sleep_until(zio->io_target_timestamp);
|
|
|
|
zio_interrupt(zio);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Use taskq_dispatch_delay() in the place of
|
|
|
|
* OpenZFS's timeout_generic().
|
|
|
|
*/
|
|
|
|
tid = taskq_dispatch_delay(system_taskq,
|
|
|
|
(task_func_t *)zio_interrupt,
|
|
|
|
zio, TQ_NOSLEEP, expire_at_tick);
|
|
|
|
if (tid == TASKQID_INVALID) {
|
|
|
|
/*
|
|
|
|
* Couldn't allocate a task. Just
|
|
|
|
* finish the zio without a delay.
|
|
|
|
*/
|
|
|
|
zio_interrupt(zio);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
|
|
|
|
zio_interrupt(zio);
|
|
|
|
}
|
|
|
|
|
2017-12-19 01:06:07 +03:00
|
|
|
static void
|
2019-02-15 23:44:24 +03:00
|
|
|
zio_deadman_impl(zio_t *pio, int ziodepth)
|
2017-12-19 01:06:07 +03:00
|
|
|
{
|
|
|
|
zio_t *cio, *cio_next;
|
|
|
|
zio_link_t *zl = NULL;
|
|
|
|
vdev_t *vd = pio->io_vd;
|
|
|
|
|
2019-02-15 23:44:24 +03:00
|
|
|
if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
|
|
|
|
vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
|
2017-12-19 01:06:07 +03:00
|
|
|
zbookmark_phys_t *zb = &pio->io_bookmark;
|
|
|
|
uint64_t delta = gethrtime() - pio->io_timestamp;
|
|
|
|
uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
|
|
|
|
|
2019-04-05 04:57:06 +03:00
|
|
|
zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
|
2017-12-19 01:06:07 +03:00
|
|
|
"delta=%llu queued=%llu io=%llu "
|
|
|
|
"path=%s last=%llu "
|
|
|
|
"type=%d priority=%d flags=0x%x "
|
|
|
|
"stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
|
|
|
|
"objset=%llu object=%llu level=%llu blkid=%llu "
|
|
|
|
"offset=%llu size=%llu error=%d",
|
2019-02-15 23:44:24 +03:00
|
|
|
ziodepth, pio, pio->io_timestamp,
|
2017-12-19 01:06:07 +03:00
|
|
|
delta, pio->io_delta, pio->io_delay,
|
2019-02-15 23:44:24 +03:00
|
|
|
vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0,
|
2017-12-19 01:06:07 +03:00
|
|
|
pio->io_type, pio->io_priority, pio->io_flags,
|
2019-02-15 23:44:24 +03:00
|
|
|
pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
|
2017-12-19 01:06:07 +03:00
|
|
|
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
|
|
|
|
pio->io_offset, pio->io_size, pio->io_error);
|
2020-09-01 05:35:11 +03:00
|
|
|
(void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
|
2020-09-04 20:34:28 +03:00
|
|
|
pio->io_spa, vd, zb, pio, 0);
|
2017-12-19 01:06:07 +03:00
|
|
|
|
|
|
|
if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
|
|
|
|
taskq_empty_ent(&pio->io_tqent)) {
|
|
|
|
zio_interrupt(pio);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
|
|
|
|
cio_next = zio_walk_children(pio, &zl);
|
2019-02-15 23:44:24 +03:00
|
|
|
zio_deadman_impl(cio, ziodepth + 1);
|
2017-12-19 01:06:07 +03:00
|
|
|
}
|
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Log the critical information describing this zio and all of its children
|
|
|
|
* using the zfs_dbgmsg() interface then post deadman event for the ZED.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zio_deadman(zio_t *pio, char *tag)
|
|
|
|
{
|
|
|
|
spa_t *spa = pio->io_spa;
|
|
|
|
char *name = spa_name(spa);
|
|
|
|
|
|
|
|
if (!zfs_deadman_enabled || spa_suspended(spa))
|
|
|
|
return;
|
|
|
|
|
2019-02-15 23:44:24 +03:00
|
|
|
zio_deadman_impl(pio, 0);
|
2017-12-19 01:06:07 +03:00
|
|
|
|
|
|
|
switch (spa_get_deadman_failmode(spa)) {
|
|
|
|
case ZIO_FAILURE_MODE_WAIT:
|
|
|
|
zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ZIO_FAILURE_MODE_CONTINUE:
|
|
|
|
zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ZIO_FAILURE_MODE_PANIC:
|
|
|
|
fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Execute the I/O pipeline until one of the following occurs:
|
|
|
|
* (1) the I/O completes; (2) the pipeline stalls waiting for
|
|
|
|
* dependent child I/Os; (3) the I/O issues, so we're waiting
|
|
|
|
* for an I/O completion interrupt; (4) the I/O is delegated by
|
|
|
|
* vdev-level caching or aggregation; (5) the I/O is deferred
|
|
|
|
* due to vdev-level queueing; (6) the I/O is handed off to
|
|
|
|
* another thread. In all cases, the pipeline stops whenever
|
2013-07-03 06:00:16 +04:00
|
|
|
* there's no CPU work; it never burns a thread in cv_wait_io().
|
2008-12-03 23:09:06 +03:00
|
|
|
*
|
|
|
|
* There's no locking on io_stage because there's no legitimate way
|
|
|
|
* for multiple threads to be attempting to process the same I/O.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static zio_pipe_stage_t *zio_pipeline[];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
/*
|
|
|
|
* zio_execute() is a wrapper around the static function
|
|
|
|
* __zio_execute() so that we can force __zio_execute() to be
|
|
|
|
* inlined. This reduces stack overhead which is important
|
|
|
|
* because __zio_execute() is called recursively in several zio
|
|
|
|
* code paths. zio_execute() itself cannot be inlined because
|
|
|
|
* it is externally visible.
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_execute(zio_t *zio)
|
2010-08-26 22:38:38 +04:00
|
|
|
{
|
2014-07-13 22:35:19 +04:00
|
|
|
fstrans_cookie_t cookie;
|
|
|
|
|
|
|
|
cookie = spl_fstrans_mark();
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(zio);
|
2014-07-13 22:35:19 +04:00
|
|
|
spl_fstrans_unmark(cookie);
|
2010-08-26 22:38:38 +04:00
|
|
|
}
|
|
|
|
|
2015-12-02 22:53:37 +03:00
|
|
|
/*
|
|
|
|
* Used to determine if in the current context the stack is sized large
|
|
|
|
* enough to allow zio_execute() to be called recursively. A minimum
|
|
|
|
* stack size of 16K is required to avoid needing to re-dispatch the zio.
|
|
|
|
*/
|
2020-06-15 21:30:37 +03:00
|
|
|
static boolean_t
|
2015-12-02 22:53:37 +03:00
|
|
|
zio_execute_stack_check(zio_t *zio)
|
|
|
|
{
|
|
|
|
#if !defined(HAVE_LARGE_STACKS)
|
|
|
|
dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
|
|
|
|
|
|
|
|
/* Executing in txg_sync_thread() context. */
|
|
|
|
if (dp && curthread == dp->dp_tx.tx_sync_thread)
|
|
|
|
return (B_TRUE);
|
|
|
|
|
|
|
|
/* Pool initialization outside of zio_taskq context. */
|
|
|
|
if (dp && spa_is_initializing(dp->dp_spa) &&
|
|
|
|
!zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
|
|
|
|
!zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
|
|
|
|
return (B_TRUE);
|
|
|
|
#endif /* HAVE_LARGE_STACKS */
|
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
__attribute__((always_inline))
|
|
|
|
static inline void
|
|
|
|
__zio_execute(zio_t *zio)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT3U(zio->io_queued_timestamp, >, 0);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
while (zio->io_stage < ZIO_STAGE_DONE) {
|
2010-05-29 00:45:14 +04:00
|
|
|
enum zio_stage pipeline = zio->io_pipeline;
|
|
|
|
enum zio_stage stage = zio->io_stage;
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
|
|
|
|
zio->io_executor = curthread;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(!MUTEX_HELD(&zio->io_lock));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(ISP2(stage));
|
|
|
|
ASSERT(zio->io_stall == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
do {
|
|
|
|
stage <<= 1;
|
|
|
|
} while ((stage & pipeline) == 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
ASSERT(stage <= ZIO_STAGE_DONE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* If we are in interrupt context and this pipeline stage
|
|
|
|
* will grab a config lock that is held across I/O,
|
2010-05-29 00:45:14 +04:00
|
|
|
* or may wait for an I/O that needs an interrupt thread
|
|
|
|
* to complete, issue async to avoid deadlock.
|
|
|
|
*
|
|
|
|
* For VDEV_IO_START, we cut in line so that the io will
|
|
|
|
* be sent to disk promptly.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2012-12-18 04:23:27 +04:00
|
|
|
if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
|
|
|
|
zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
|
2015-12-02 22:53:37 +03:00
|
|
|
boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
|
|
|
|
zio_requeue_io_start_cut_in_line : B_FALSE;
|
2012-12-18 04:23:27 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-12-02 22:53:37 +03:00
|
|
|
* If the current context doesn't have large enough stacks
|
|
|
|
* the zio must be issued asynchronously to prevent overflow.
|
2012-12-18 04:23:27 +04:00
|
|
|
*/
|
2015-12-02 22:53:37 +03:00
|
|
|
if (zio_execute_stack_check(zio)) {
|
|
|
|
boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
|
|
|
|
zio_requeue_io_start_cut_in_line : B_FALSE;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
|
2008-12-03 23:09:06 +03:00
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_stage = stage;
|
2016-10-14 03:59:18 +03:00
|
|
|
zio->io_pipeline_trace |= zio->io_stage;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
/*
|
|
|
|
* The zio pipeline stage returns the next zio to execute
|
|
|
|
* (typically the same as this one), or NULL if we should
|
|
|
|
* stop.
|
|
|
|
*/
|
|
|
|
zio = zio_pipeline[highbit64(stage) - 1](zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
if (zio == NULL)
|
|
|
|
return;
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Initiate I/O, either sync or async
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zio_wait(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2020-02-29 01:49:44 +03:00
|
|
|
/*
|
|
|
|
* Some routines, like zio_free_sync(), may return a NULL zio
|
|
|
|
* to avoid the performance overhead of creating and then destroying
|
|
|
|
* an unneeded zio. For the callers' simplicity, we accept a NULL
|
|
|
|
* zio and ignore it.
|
|
|
|
*/
|
|
|
|
if (zio == NULL)
|
|
|
|
return (0);
|
|
|
|
|
2017-12-19 01:06:07 +03:00
|
|
|
long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
|
2008-12-03 23:09:06 +03:00
|
|
|
int error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
|
|
|
|
ASSERT3P(zio->io_executor, ==, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_waiter = curthread;
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT0(zio->io_queued_timestamp);
|
|
|
|
zio->io_queued_timestamp = gethrtime();
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
2017-12-19 01:06:07 +03:00
|
|
|
while (zio->io_executor != NULL) {
|
|
|
|
error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
|
|
|
|
ddi_get_lbolt() + timeout);
|
|
|
|
|
|
|
|
if (zfs_deadman_enabled && error == -1 &&
|
|
|
|
gethrtime() - zio->io_queued_timestamp >
|
|
|
|
spa_deadman_ziotime(zio->io_spa)) {
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
|
|
|
|
zio_deadman(zio, FTAG);
|
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
}
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&zio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
error = zio->io_error;
|
|
|
|
zio_destroy(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (error);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_nowait(zio_t *zio)
|
|
|
|
{
|
2020-02-29 01:49:44 +03:00
|
|
|
/*
|
|
|
|
* See comment in zio_wait().
|
|
|
|
*/
|
|
|
|
if (zio == NULL)
|
|
|
|
return;
|
|
|
|
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3P(zio->io_executor, ==, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
|
|
|
|
zio_unique_parent(zio) == NULL) {
|
2014-10-08 00:20:49 +04:00
|
|
|
zio_t *pio;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* This is a logical async I/O with no parent to wait for it.
|
2009-07-03 02:44:48 +04:00
|
|
|
* We add it to the spa_async_root_zio "Godfather" I/O which
|
|
|
|
* will ensure they complete prior to unloading the pool.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
spa_t *spa = zio->io_spa;
|
2020-11-02 22:51:12 +03:00
|
|
|
pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE];
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2014-10-08 00:20:49 +04:00
|
|
|
zio_add_child(pio, zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT0(zio->io_queued_timestamp);
|
|
|
|
zio->io_queued_timestamp = gethrtime();
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
* Reexecute, cancel, or suspend/resume failed I/O
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_reexecute(zio_t *pio)
|
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *cio, *cio_next;
|
|
|
|
|
|
|
|
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(pio->io_gang_leader == NULL);
|
|
|
|
ASSERT(pio->io_gang_tree == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_flags = pio->io_orig_flags;
|
|
|
|
pio->io_stage = pio->io_orig_stage;
|
|
|
|
pio->io_pipeline = pio->io_orig_pipeline;
|
|
|
|
pio->io_reexecute = 0;
|
2013-05-10 23:47:54 +04:00
|
|
|
pio->io_flags |= ZIO_FLAG_REEXECUTED;
|
2016-10-14 03:59:18 +03:00
|
|
|
pio->io_pipeline_trace = 0;
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_error = 0;
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
|
2009-02-18 23:51:31 +03:00
|
|
|
pio->io_state[w] = 0;
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_child_error[c] = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (IO_IS_ALLOCATING(pio))
|
|
|
|
BP_ZERO(pio->io_bp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* As we reexecute pio's children, new children could be created.
|
2009-02-18 23:51:31 +03:00
|
|
|
* New children go to the head of pio's io_child_list, however,
|
2008-12-03 23:09:06 +03:00
|
|
|
* so we will (correctly) not reexecute them. The key is that
|
2009-02-18 23:51:31 +03:00
|
|
|
* the remainder of pio's io_child_list, from 'cio_next' onward,
|
|
|
|
* cannot be affected by any side effects of reexecuting 'cio'.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
2017-11-04 23:25:13 +03:00
|
|
|
zio_link_t *zl = NULL;
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
2016-10-14 03:59:18 +03:00
|
|
|
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
|
|
|
|
cio_next = zio_walk_children(pio, &zl);
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
|
2009-02-18 23:51:31 +03:00
|
|
|
pio->io_children[cio->io_child_type][w]++;
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_reexecute(cio);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Now that all children have been reexecuted, execute the parent.
|
2009-07-03 02:44:48 +04:00
|
|
|
* We don't reexecute "The Godfather" I/O here as it's the
|
2017-02-17 22:48:20 +03:00
|
|
|
* responsibility of the caller to wait on it.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
2016-10-14 03:59:18 +03:00
|
|
|
if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
|
|
|
|
pio->io_queued_timestamp = gethrtime();
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(pio);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
2018-03-15 20:56:55 +03:00
|
|
|
zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
|
|
|
|
fm_panic("Pool '%s' has encountered an uncorrectable I/O "
|
|
|
|
"failure and the failure mode property for this pool "
|
|
|
|
"is set to panic.", spa_name(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-07-11 02:13:09 +04:00
|
|
|
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
|
|
|
|
"failure and has been suspended.\n", spa_name(spa));
|
|
|
|
|
2020-09-01 05:35:11 +03:00
|
|
|
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
|
2020-09-04 20:34:28 +03:00
|
|
|
NULL, NULL, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&spa->spa_suspend_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (spa->spa_suspend_zio_root == NULL)
|
2009-07-03 02:44:48 +04:00
|
|
|
spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
|
|
|
|
ZIO_FLAG_GODFATHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2018-03-15 20:56:55 +03:00
|
|
|
spa->spa_suspended = reason;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio != NULL) {
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio != spa->spa_suspend_zio_root);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
2009-02-18 23:51:31 +03:00
|
|
|
ASSERT(zio_unique_parent(zio) == NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_DONE);
|
|
|
|
zio_add_child(spa->spa_suspend_zio_root, zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&spa->spa_suspend_lock);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_resume(spa_t *spa)
|
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *pio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Reexecute all previously suspended i/o.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&spa->spa_suspend_lock);
|
2018-03-15 20:56:55 +03:00
|
|
|
spa->spa_suspended = ZIO_SUSPEND_NONE;
|
2008-12-03 23:09:06 +03:00
|
|
|
cv_broadcast(&spa->spa_suspend_cv);
|
|
|
|
pio = spa->spa_suspend_zio_root;
|
|
|
|
spa->spa_suspend_zio_root = NULL;
|
|
|
|
mutex_exit(&spa->spa_suspend_lock);
|
|
|
|
|
|
|
|
if (pio == NULL)
|
2009-07-03 02:44:48 +04:00
|
|
|
return (0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_reexecute(pio);
|
|
|
|
return (zio_wait(pio));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_resume_wait(spa_t *spa)
|
|
|
|
{
|
|
|
|
mutex_enter(&spa->spa_suspend_lock);
|
|
|
|
while (spa_suspended(spa))
|
|
|
|
cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
|
|
|
|
mutex_exit(&spa->spa_suspend_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* Gang blocks.
|
|
|
|
*
|
|
|
|
* A gang block is a collection of small blocks that looks to the DMU
|
|
|
|
* like one large block. When zio_dva_allocate() cannot find a block
|
|
|
|
* of the requested size, due to either severe fragmentation or the pool
|
|
|
|
* being nearly full, it calls zio_write_gang_block() to construct the
|
|
|
|
* block from smaller fragments.
|
|
|
|
*
|
|
|
|
* A gang block consists of a gang header (zio_gbh_phys_t) and up to
|
|
|
|
* three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
|
|
|
|
* an indirect block: it's an array of block pointers. It consumes
|
|
|
|
* only one sector and hence is allocatable regardless of fragmentation.
|
|
|
|
* The gang header's bps point to its gang members, which hold the data.
|
|
|
|
*
|
|
|
|
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
|
|
|
|
* as the verifier to ensure uniqueness of the SHA256 checksum.
|
|
|
|
* Critically, the gang block bp's blk_cksum is the checksum of the data,
|
|
|
|
* not the gang header. This ensures that data block signatures (needed for
|
|
|
|
* deduplication) are independent of how the block is physically stored.
|
|
|
|
*
|
|
|
|
* Gang blocks can be nested: a gang member may itself be a gang block.
|
|
|
|
* Thus every gang block is a tree in which root and all interior nodes are
|
|
|
|
* gang headers, and the leaves are normal blocks that contain user data.
|
|
|
|
* The root of the gang tree is called the gang leader.
|
|
|
|
*
|
|
|
|
* To perform any operation (read, rewrite, free, claim) on a gang block,
|
|
|
|
* zio_gang_assemble() first assembles the gang tree (minus data leaves)
|
|
|
|
* in the io_gang_tree field of the original logical i/o by recursively
|
|
|
|
* reading the gang leader and all gang headers below it. This yields
|
|
|
|
* an in-core tree containing the contents of every gang header and the
|
|
|
|
* bps for every constituent of the gang block.
|
|
|
|
*
|
|
|
|
* With the gang tree now assembled, zio_gang_issue() just walks the gang tree
|
|
|
|
* and invokes a callback on each bp. To free a gang block, zio_gang_issue()
|
|
|
|
* calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
|
|
|
|
* zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
|
|
|
|
* zio_read_gang() is a wrapper around zio_read() that omits reading gang
|
|
|
|
* headers, since we already have those in io_gang_tree. zio_rewrite_gang()
|
|
|
|
* performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
|
|
|
|
* of the gang header plus zio_checksum_compute() of the data to update the
|
|
|
|
* gang header's blk_cksum as described above.
|
|
|
|
*
|
|
|
|
* The two-phase assemble/issue model solves the problem of partial failure --
|
|
|
|
* what if you'd freed part of a gang block but then couldn't read the
|
|
|
|
* gang header for another part? Assembling the entire gang tree first
|
|
|
|
* ensures that all the necessary gang header I/O has succeeded before
|
|
|
|
* starting the actual work of free, claim, or write. Once the gang tree
|
|
|
|
* is assembled, free and claim are in-memory operations that cannot fail.
|
|
|
|
*
|
|
|
|
* In the event that a gang write fails, zio_dva_unallocate() walks the
|
|
|
|
* gang tree to immediately free (i.e. insert back into the space map)
|
|
|
|
* everything we've allocated. This ensures that we don't get ENOSPC
|
|
|
|
* errors during repeated suspend/resume cycles due to a flaky device.
|
|
|
|
*
|
|
|
|
* Gang rewrites only happen during sync-to-convergence. If we can't assemble
|
|
|
|
* the gang tree, we won't modify the block, so we can safely defer the free
|
|
|
|
* (knowing that the block is still intact). If we *can* assemble the gang
|
|
|
|
* tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
|
|
|
|
* each constituent bp and we can allocate a new block on the next sync pass.
|
|
|
|
*
|
|
|
|
* In all cases, the gang tree allows complete recovery from partial failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
static void
|
|
|
|
zio_gang_issue_func_done(zio_t *zio)
|
|
|
|
{
|
2021-01-20 22:24:37 +03:00
|
|
|
abd_free(zio->io_abd);
|
2016-07-22 18:52:49 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static zio_t *
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
if (gn != NULL)
|
|
|
|
return (pio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
|
|
|
|
BP_GET_PSIZE(bp), zio_gang_issue_func_done,
|
|
|
|
NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
|
2008-12-03 23:09:06 +03:00
|
|
|
&pio->io_bookmark));
|
|
|
|
}
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
static zio_t *
|
|
|
|
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
if (gn != NULL) {
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *gbh_abd =
|
|
|
|
abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
|
|
|
|
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
|
|
|
|
&pio->io_bookmark);
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* As we rewrite each gang header, the pipeline will compute
|
|
|
|
* a new gang block header checksum for it; but no one will
|
|
|
|
* compute a new data checksum, so we do that here. The one
|
|
|
|
* exception is the gang leader: the pipeline already computed
|
|
|
|
* its data checksum because that stage precedes gang assembly.
|
|
|
|
* (Presently, nothing actually uses interior data checksums;
|
|
|
|
* this is just good hygiene.)
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
if (gn != pio->io_gang_leader->io_gang_tree) {
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *buf = abd_get_offset(data, offset);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
|
2016-07-22 18:52:49 +03:00
|
|
|
buf, BP_GET_PSIZE(bp));
|
|
|
|
|
2021-01-20 22:24:37 +03:00
|
|
|
abd_free(buf);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* If we are here to damage data for testing purposes,
|
|
|
|
* leave the GBH alone so that we can detect the damage.
|
|
|
|
*/
|
|
|
|
if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
|
|
|
|
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_get_offset(data, offset), BP_GET_PSIZE(bp),
|
|
|
|
zio_gang_issue_func_done, NULL, pio->io_priority,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* ARGSUSED */
|
2016-07-22 18:52:49 +03:00
|
|
|
static zio_t *
|
|
|
|
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2020-02-29 01:49:44 +03:00
|
|
|
zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
|
|
|
|
ZIO_GANG_CHILD_FLAGS(pio));
|
|
|
|
if (zio == NULL) {
|
|
|
|
zio = zio_null(pio, pio->io_spa,
|
|
|
|
NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio));
|
|
|
|
}
|
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* ARGSUSED */
|
2016-07-22 18:52:49 +03:00
|
|
|
static zio_t *
|
|
|
|
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
|
|
|
|
NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
|
|
|
|
}
|
|
|
|
|
|
|
|
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
|
|
|
|
NULL,
|
|
|
|
zio_read_gang,
|
|
|
|
zio_rewrite_gang,
|
|
|
|
zio_free_gang,
|
|
|
|
zio_claim_gang,
|
|
|
|
NULL
|
|
|
|
};
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void zio_gang_tree_assemble_done(zio_t *zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static zio_gang_node_t *
|
|
|
|
zio_gang_node_alloc(zio_gang_node_t **gnpp)
|
|
|
|
{
|
|
|
|
zio_gang_node_t *gn;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(*gnpp == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-11-21 03:09:39 +03:00
|
|
|
gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
|
2008-12-03 23:09:06 +03:00
|
|
|
gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
|
|
|
|
*gnpp = gn;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (gn);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_free(zio_gang_node_t **gnpp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = *gnpp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(gn->gn_child[g] == NULL);
|
|
|
|
|
|
|
|
zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
|
|
|
|
kmem_free(gn, sizeof (*gn));
|
|
|
|
*gnpp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_gang_tree_free(zio_gang_node_t **gnpp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = *gnpp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (gn == NULL)
|
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_tree_free(&gn->gn_child[g]);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_free(gnpp);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(gio->io_gang_leader == gio);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(BP_IS_GANG(bp));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
|
|
|
|
zio_gang_tree_assemble_done, gn, gio->io_priority,
|
|
|
|
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_gang_tree_assemble_done(zio_t *zio)
|
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *gio = zio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = zio->io_private;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(gio == zio_unique_parent(zio));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_count == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error)
|
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
/* this ABD was created from a linear buf in zio_gang_tree_assemble */
|
2008-12-03 23:09:06 +03:00
|
|
|
if (BP_SHOULD_BYTESWAP(bp))
|
2016-07-22 18:52:49 +03:00
|
|
|
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2021-01-20 22:24:37 +03:00
|
|
|
abd_free(zio->io_abd);
|
2016-07-22 18:52:49 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
|
|
|
|
if (!BP_IS_GANG(gbp))
|
|
|
|
continue;
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *gio = pio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(BP_IS_GANG(bp) == !!gn);
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
|
|
|
|
ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* If you're a gang header, your data is in gn->gn_gbh.
|
|
|
|
* If you're a gang member, your data is in 'data' and gn == NULL.
|
|
|
|
*/
|
2016-07-22 18:52:49 +03:00
|
|
|
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (gn != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
|
|
|
|
if (BP_IS_HOLE(gbp))
|
|
|
|
continue;
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
|
|
|
|
offset);
|
|
|
|
offset += BP_GET_PSIZE(gbp);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (gn == gio->io_gang_tree)
|
2016-07-22 18:52:49 +03:00
|
|
|
ASSERT3U(gio->io_size, ==, offset);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio != pio)
|
|
|
|
zio_nowait(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_assemble(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
|
|
|
|
|
|
|
zio->io_gang_leader = zio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_issue(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
|
|
|
|
0);
|
2008-12-03 23:09:06 +03:00
|
|
|
else
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_free(&zio->io_gang_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write_gang_member_ready(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *pio = zio_unique_parent(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
dva_t *cdva = zio->io_bp->blk_dva;
|
|
|
|
dva_t *pdva = pio->io_bp->blk_dva;
|
|
|
|
uint64_t asize;
|
2019-12-05 23:37:00 +03:00
|
|
|
zio_t *gio __maybe_unused = zio->io_gang_leader;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (BP_IS_HOLE(zio->io_bp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
|
|
|
|
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
|
|
|
|
ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
|
|
|
|
ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(DVA_GET_GANG(&pdva[d]));
|
|
|
|
asize = DVA_GET_ASIZE(&pdva[d]);
|
|
|
|
asize += DVA_GET_ASIZE(&cdva[d]);
|
|
|
|
DVA_SET_ASIZE(&pdva[d], asize);
|
|
|
|
}
|
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
static void
|
|
|
|
zio_write_gang_done(zio_t *zio)
|
|
|
|
{
|
2018-03-15 22:47:26 +03:00
|
|
|
/*
|
|
|
|
* The io_abd field will be NULL for a zio with no data. The io_flags
|
|
|
|
* will initially have the ZIO_FLAG_NODATA bit flag set, but we can't
|
|
|
|
* check for it here as it is cleared in zio_ready.
|
|
|
|
*/
|
|
|
|
if (zio->io_abd != NULL)
|
2021-01-20 22:24:37 +03:00
|
|
|
abd_free(zio->io_abd);
|
2016-07-22 18:52:49 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
Set aside a metaslab for ZIL blocks
Mixing ZIL and normal allocations has several problems:
1. The ZIL allocations are allocated, written to disk, and then a few
seconds later freed. This leaves behind holes (free segments) where the
ZIL blocks used to be, which increases fragmentation, which negatively
impacts performance.
2. When under moderate load, ZIL allocations are of 128KB. If the pool
is fairly fragmented, there may not be many free chunks of that size.
This causes ZFS to load more metaslabs to locate free segments of 128KB
or more. The loading happens synchronously (from zil_commit()), and can
take around a second even if the metaslab's spacemap is cached in the
ARC. All concurrent synchronous operations on this filesystem must wait
while the metaslab is loading. This can cause a significant performance
impact.
3. If the pool is very fragmented, there may be zero free chunks of
128KB or more. In this case, the ZIL falls back to txg_wait_synced(),
which has an enormous performance impact.
These problems can be eliminated by using a dedicated log device
("slog"), even one with the same performance characteristics as the
normal devices.
This change sets aside one metaslab from each top-level vdev that is
preferentially used for ZIL allocations (vdev_log_mg,
spa_embedded_log_class). From an allocation perspective, this is
similar to having a dedicated log device, and it eliminates the
above-mentioned performance problems.
Log (ZIL) blocks can be allocated from the following locations. Each
one is tried in order until the allocation succeeds:
1. dedicated log vdevs, aka "slog" (spa_log_class)
2. embedded slog metaslabs (spa_embedded_log_class)
3. other metaslabs in normal vdevs (spa_normal_class)
The space required for the embedded slog metaslabs is usually between
0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop"
space that is not available for user data.
On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity,
and recordsize=8k, testing shows a ~50% performance increase on random
8k sync writes. On even more fragmented systems (which hit problem #3
above and call txg_wait_synced()), the performance improvement can be
arbitrarily large (>100x).
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11389
2021-01-22 02:12:54 +03:00
|
|
|
zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
spa_t *spa = pio->io_spa;
|
|
|
|
blkptr_t *bp = pio->io_bp;
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *gio = pio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
|
|
|
zio_gang_node_t *gn, **gnpp;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_gbh_phys_t *gbh;
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *gbh_abd;
|
2008-12-03 23:09:06 +03:00
|
|
|
uint64_t txg = pio->io_txg;
|
|
|
|
uint64_t resid = pio->io_size;
|
|
|
|
uint64_t lsize;
|
2010-05-29 00:45:14 +04:00
|
|
|
int copies = gio->io_prop.zp_copies;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
int gbh_copies;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_prop_t zp;
|
2017-11-04 23:25:13 +03:00
|
|
|
int error;
|
2018-03-15 22:47:26 +03:00
|
|
|
boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* encrypted blocks need DVA[2] free so encrypted gang headers can't
|
|
|
|
* have a third copy.
|
|
|
|
*/
|
|
|
|
gbh_copies = MIN(copies + 1, spa_max_replication(spa));
|
|
|
|
if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
|
|
|
|
gbh_copies = SPA_DVAS_PER_BP - 1;
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
|
2016-10-14 03:59:18 +03:00
|
|
|
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
2018-03-15 22:47:26 +03:00
|
|
|
ASSERT(has_data);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
flags |= METASLAB_ASYNC_ALLOC;
|
2020-12-15 21:55:44 +03:00
|
|
|
VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
|
|
|
|
mca_alloc_slots, pio));
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The logical zio has already placed a reservation for
|
|
|
|
* 'copies' allocation slots but gang blocks may require
|
|
|
|
* additional copies. These additional copies
|
|
|
|
* (i.e. gbh_copies - copies) are guaranteed to succeed
|
|
|
|
* since metaslab_class_throttle_reserve() always allows
|
|
|
|
* additional reservations for gang blocks.
|
|
|
|
*/
|
|
|
|
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
pio->io_allocator, pio, flags));
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
|
2017-01-12 22:52:56 +03:00
|
|
|
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
&pio->io_alloc_list, pio, pio->io_allocator);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
2016-10-14 03:59:18 +03:00
|
|
|
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
2018-03-15 22:47:26 +03:00
|
|
|
ASSERT(has_data);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we failed to allocate the gang block header then
|
|
|
|
* we remove any additional allocation reservations that
|
|
|
|
* we placed here. The original reservation will
|
|
|
|
* be removed when the logical I/O goes to the ready
|
|
|
|
* stage.
|
|
|
|
*/
|
|
|
|
metaslab_class_throttle_unreserve(mc,
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
gbh_copies - copies, pio->io_allocator, pio);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_error = error;
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (pio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (pio == gio) {
|
|
|
|
gnpp = &gio->io_gang_tree;
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
gnpp = pio->io_private;
|
|
|
|
ASSERT(pio->io_ready == zio_write_gang_member_ready);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
gn = zio_gang_node_alloc(gnpp);
|
|
|
|
gbh = gn->gn_gbh;
|
|
|
|
bzero(gbh, SPA_GANGBLOCKSIZE);
|
2016-07-22 18:52:49 +03:00
|
|
|
gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Create the gang header.
|
|
|
|
*/
|
2016-07-22 18:52:49 +03:00
|
|
|
zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
|
|
|
|
zio_write_gang_done, NULL, pio->io_priority,
|
|
|
|
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Create and nowait the gang children.
|
|
|
|
*/
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; resid != 0; resid -= lsize, g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
|
|
|
|
SPA_MINBLOCKSIZE);
|
|
|
|
ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zp.zp_checksum = gio->io_prop.zp_checksum;
|
2008-12-03 23:09:06 +03:00
|
|
|
zp.zp_compress = ZIO_COMPRESS_OFF;
|
Add zstd support to zfs
This PR adds two new compression types, based on ZStandard:
- zstd: A basic ZStandard compression algorithm Available compression.
Levels for zstd are zstd-1 through zstd-19, where the compression
increases with every level, but speed decreases.
- zstd-fast: A faster version of the ZStandard compression algorithm
zstd-fast is basically a "negative" level of zstd. The compression
decreases with every level, but speed increases.
Available compression levels for zstd-fast:
- zstd-fast-1 through zstd-fast-10
- zstd-fast-20 through zstd-fast-100 (in increments of 10)
- zstd-fast-500 and zstd-fast-1000
For more information check the man page.
Implementation details:
Rather than treat each level of zstd as a different algorithm (as was
done historically with gzip), the block pointer `enum zio_compress`
value is simply zstd for all levels, including zstd-fast, since they all
use the same decompression function.
The compress= property (a 64bit unsigned integer) uses the lower 7 bits
to store the compression algorithm (matching the number of bits used in
a block pointer, as the 8th bit was borrowed for embedded block
pointers). The upper bits are used to store the compression level.
It is necessary to be able to determine what compression level was used
when later reading a block back, so the concept used in LZ4, where the
first 32bits of the on-disk value are the size of the compressed data
(since the allocation is rounded up to the nearest ashift), was
extended, and we store the version of ZSTD and the level as well as the
compressed size. This value is returned when decompressing a block, so
that if the block needs to be recompressed (L2ARC, nop-write, etc), that
the same parameters will be used to result in the matching checksum.
All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`,
`zio_prop_t`, etc.) uses the separated _compress and _complevel
variables. Only the properties ZAP contains the combined/bit-shifted
value. The combined value is split when the compression_changed_cb()
callback is called, and sets both objset members (os_compress and
os_complevel).
The userspace tools all use the combined/bit-shifted value.
Additional notes:
zdb can now also decode the ZSTD compression header (flag -Z) and
inspect the size, version and compression level saved in that header.
For each record, if it is ZSTD compressed, the parameters of the decoded
compression header get printed.
ZSTD is included with all current tests and new tests are added
as-needed.
Per-dataset feature flags now get activated when the property is set.
If a compression algorithm requires a feature flag, zfs activates the
feature when the property is set, rather than waiting for the first
block to be born. This is currently only used by zstd but can be
extended as needed.
Portions-Sponsored-By: The FreeBSD Foundation
Co-authored-by: Allan Jude <allanjude@freebsd.org>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Co-authored-by: Michael Niewöhner <foss@mniewoehner.de>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #6247
Closes #9024
Closes #10277
Closes #10278
2020-08-18 20:10:17 +03:00
|
|
|
zp.zp_complevel = gio->io_prop.zp_complevel;
|
2008-12-03 23:09:06 +03:00
|
|
|
zp.zp_type = DMU_OT_NONE;
|
|
|
|
zp.zp_level = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
zp.zp_copies = gio->io_prop.zp_copies;
|
2013-05-10 23:47:54 +04:00
|
|
|
zp.zp_dedup = B_FALSE;
|
|
|
|
zp.zp_dedup_verify = B_FALSE;
|
|
|
|
zp.zp_nopwrite = B_FALSE;
|
2017-09-12 23:15:11 +03:00
|
|
|
zp.zp_encrypt = gio->io_prop.zp_encrypt;
|
|
|
|
zp.zp_byteorder = gio->io_prop.zp_byteorder;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
|
|
|
|
bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
|
|
|
|
bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
|
2018-03-15 22:47:26 +03:00
|
|
|
has_data ? abd_get_offset(pio->io_abd, pio->io_size -
|
|
|
|
resid) : NULL, lsize, lsize, &zp,
|
|
|
|
zio_write_gang_member_ready, NULL, NULL,
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
|
2016-10-14 03:59:18 +03:00
|
|
|
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
|
|
|
|
|
|
|
|
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
2018-03-15 22:47:26 +03:00
|
|
|
ASSERT(has_data);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Gang children won't throttle but we should
|
|
|
|
* account for their work, so reserve an allocation
|
|
|
|
* slot for them here.
|
|
|
|
*/
|
|
|
|
VERIFY(metaslab_class_throttle_reserve(mc,
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
zp.zp_copies, cio->io_allocator, cio, flags));
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
zio_nowait(cio);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Set pio's pipeline to just wait for zio to finish.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
/*
|
|
|
|
* We didn't allocate this bp, so make sure it doesn't get unmarked.
|
|
|
|
*/
|
|
|
|
pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_nowait(zio);
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (pio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
2016-06-16 01:47:05 +03:00
|
|
|
* The zio_nop_write stage in the pipeline determines if allocating a
|
|
|
|
* new bp is necessary. The nopwrite feature can handle writes in
|
|
|
|
* either syncing or open context (i.e. zil writes) and as a result is
|
|
|
|
* mutually exclusive with dedup.
|
|
|
|
*
|
|
|
|
* By leveraging a cryptographically secure checksum, such as SHA256, we
|
|
|
|
* can compare the checksums of the new data and the old to determine if
|
|
|
|
* allocating a new block is required. Note that our requirements for
|
|
|
|
* cryptographic strength are fairly weak: there can't be any accidental
|
|
|
|
* hash collisions, but we don't need to be secure against intentional
|
|
|
|
* (malicious) collisions. To trigger a nopwrite, you have to be able
|
|
|
|
* to write the file to begin with, and triggering an incorrect (hash
|
|
|
|
* collision) nopwrite is no worse than simply writing to the file.
|
|
|
|
* That said, there are no known attacks against the checksum algorithms
|
|
|
|
* used for nopwrite, assuming that the salt and the checksums
|
|
|
|
* themselves remain secret.
|
2013-05-10 23:47:54 +04:00
|
|
|
*/
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2013-05-10 23:47:54 +04:00
|
|
|
zio_nop_write(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_LEVEL(bp) == 0);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
|
|
|
ASSERT(zp->zp_nopwrite);
|
|
|
|
ASSERT(!zp->zp_dedup);
|
|
|
|
ASSERT(zio->io_bp_override == NULL);
|
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check to see if the original bp and the new bp have matching
|
|
|
|
* characteristics (i.e. same checksum, compression algorithms, etc).
|
|
|
|
* If they don't then just continue with the pipeline which will
|
|
|
|
* allocate a new bp.
|
|
|
|
*/
|
|
|
|
if (BP_IS_HOLE(bp_orig) ||
|
2016-06-16 01:47:05 +03:00
|
|
|
!(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
|
|
|
|
ZCHECKSUM_FLAG_NOPWRITE) ||
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
|
2013-05-10 23:47:54 +04:00
|
|
|
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
|
|
|
|
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
|
|
|
|
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
|
|
|
|
zp->zp_copies != BP_GET_NDVAS(bp_orig))
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2013-05-10 23:47:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the checksums match then reset the pipeline so that we
|
|
|
|
* avoid allocating a new bp and issuing any I/O.
|
|
|
|
*/
|
|
|
|
if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
|
2016-06-16 01:47:05 +03:00
|
|
|
ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
|
|
|
|
ZCHECKSUM_FLAG_NOPWRITE);
|
2013-05-10 23:47:54 +04:00
|
|
|
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
|
|
|
|
ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
|
|
|
|
ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
|
|
|
|
ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
|
|
|
|
sizeof (uint64_t)) == 0);
|
|
|
|
|
2019-06-28 22:40:24 +03:00
|
|
|
/*
|
|
|
|
* If we're overwriting a block that is currently on an
|
|
|
|
* indirect vdev, then ignore the nopwrite request and
|
|
|
|
* allow a new block to be allocated on a concrete vdev.
|
|
|
|
*/
|
|
|
|
spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER);
|
|
|
|
vdev_t *tvd = vdev_lookup_top(zio->io_spa,
|
|
|
|
DVA_GET_VDEV(&bp->blk_dva[0]));
|
|
|
|
if (tvd->vdev_ops == &vdev_indirect_ops) {
|
|
|
|
spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
*bp = *bp_orig;
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
zio->io_flags |= ZIO_FLAG_NOPWRITE;
|
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2013-05-10 23:47:54 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2010-05-29 00:45:14 +04:00
|
|
|
* Dedup
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
zio_ddt_child_read_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp;
|
|
|
|
zio_t *pio = zio_unique_parent(zio);
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
ddp = ddt_phys_select(dde, bp);
|
|
|
|
if (zio->io_error == 0)
|
|
|
|
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
|
2016-07-22 18:52:49 +03:00
|
|
|
|
|
|
|
if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
|
|
|
|
dde->dde_repair_abd = zio->io_abd;
|
2010-05-29 00:45:14 +04:00
|
|
|
else
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_free(zio->io_abd);
|
2010-05-29 00:45:14 +04:00
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_read_start(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
if (zio->io_child_error[ZIO_CHILD_DDT]) {
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
|
|
|
ddt_entry_t *dde = ddt_repair_start(ddt, bp);
|
|
|
|
ddt_phys_t *ddp = dde->dde_phys;
|
|
|
|
ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
|
|
|
|
blkptr_t blk;
|
|
|
|
|
|
|
|
ASSERT(zio->io_vsd == NULL);
|
|
|
|
zio->io_vsd = dde;
|
|
|
|
|
|
|
|
if (ddp_self == NULL)
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
|
|
|
|
continue;
|
|
|
|
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
|
|
|
|
&blk);
|
|
|
|
zio_nowait(zio_read(zio, zio->io_spa, &blk,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_alloc_for_io(zio->io_size, B_TRUE),
|
|
|
|
zio->io_size, zio_ddt_child_read_done, dde,
|
|
|
|
zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
|
|
|
|
ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_nowait(zio_read(zio, zio->io_spa, bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_read_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
if (zio->io_child_error[ZIO_CHILD_DDT]) {
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
|
|
|
ddt_entry_t *dde = zio->io_vsd;
|
|
|
|
if (ddt == NULL) {
|
|
|
|
ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
if (dde == NULL) {
|
|
|
|
zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
|
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2016-07-22 18:52:49 +03:00
|
|
|
if (dde->dde_repair_abd != NULL) {
|
|
|
|
abd_copy(zio->io_abd, dde->dde_repair_abd,
|
|
|
|
zio->io_size);
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_child_error[ZIO_CHILD_DDT] = 0;
|
|
|
|
}
|
|
|
|
ddt_repair_done(ddt, dde);
|
|
|
|
zio->io_vsd = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(zio->io_vsd == NULL);
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static boolean_t
|
|
|
|
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
2016-09-13 04:34:19 +03:00
|
|
|
boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2016-09-13 04:34:19 +03:00
|
|
|
ASSERT(!(zio->io_bp_override && do_raw));
|
2016-07-11 20:45:52 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Note: we compare the original data, not the transformed data,
|
|
|
|
* because when zio->io_bp is an override bp, we will not have
|
|
|
|
* pushed the I/O transforms. That's an important optimization
|
|
|
|
* because otherwise we'd compress/encrypt all dmu_sync() data twice.
|
2016-09-13 04:34:19 +03:00
|
|
|
* However, we should never get a raw, override zio so in these
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
* cases we can compare the io_abd directly. This is useful because
|
2016-09-13 04:34:19 +03:00
|
|
|
* it allows us to do dedup verification even if we don't have access
|
|
|
|
* to the original data (for instance, if the encryption keys aren't
|
|
|
|
* loaded).
|
2010-05-29 00:45:14 +04:00
|
|
|
*/
|
2016-09-13 04:34:19 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_t *lio = dde->dde_lead_zio[p];
|
|
|
|
|
2016-09-13 04:34:19 +03:00
|
|
|
if (lio != NULL && do_raw) {
|
|
|
|
return (lio->io_size != zio->io_size ||
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_cmp(zio->io_abd, lio->io_abd) != 0);
|
2016-09-13 04:34:19 +03:00
|
|
|
} else if (lio != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
return (lio->io_orig_size != zio->io_orig_size ||
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
|
2016-09-13 04:34:19 +03:00
|
|
|
if (ddp->ddp_phys_birth != 0 && do_raw) {
|
|
|
|
blkptr_t blk = *zio->io_bp;
|
|
|
|
uint64_t psize;
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *tmpabd;
|
2016-09-13 04:34:19 +03:00
|
|
|
int error;
|
|
|
|
|
|
|
|
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
|
|
|
|
psize = BP_GET_PSIZE(&blk);
|
|
|
|
|
|
|
|
if (psize != zio->io_size)
|
|
|
|
return (B_TRUE);
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
tmpabd = abd_alloc_for_io(psize, B_TRUE);
|
2016-09-13 04:34:19 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
|
2016-09-13 04:34:19 +03:00
|
|
|
psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
|
|
|
|
ZIO_FLAG_RAW, &zio->io_bookmark));
|
|
|
|
|
|
|
|
if (error == 0) {
|
2016-07-22 18:52:49 +03:00
|
|
|
if (abd_cmp(tmpabd, zio->io_abd) != 0)
|
2016-09-13 04:34:19 +03:00
|
|
|
error = SET_ERROR(ENOENT);
|
|
|
|
}
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_free(tmpabd);
|
2016-09-13 04:34:19 +03:00
|
|
|
ddt_enter(ddt);
|
|
|
|
return (error != 0);
|
|
|
|
} else if (ddp->ddp_phys_birth != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
arc_buf_t *abuf = NULL;
|
2014-12-06 20:24:32 +03:00
|
|
|
arc_flags_t aflags = ARC_FLAG_WAIT;
|
2010-05-29 00:45:14 +04:00
|
|
|
blkptr_t blk = *zio->io_bp;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
|
|
|
|
|
2016-09-13 04:34:19 +03:00
|
|
|
if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
|
|
|
|
return (B_TRUE);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_exit(ddt);
|
|
|
|
|
2013-07-03 00:26:24 +04:00
|
|
|
error = arc_read(NULL, spa, &blk,
|
2010-05-29 00:45:14 +04:00
|
|
|
arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
|
|
|
|
&aflags, &zio->io_bookmark);
|
|
|
|
|
|
|
|
if (error == 0) {
|
2016-07-22 18:52:49 +03:00
|
|
|
if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_orig_size) != 0)
|
2016-09-13 04:34:19 +03:00
|
|
|
error = SET_ERROR(ENOENT);
|
2016-06-02 07:04:53 +03:00
|
|
|
arc_buf_destroy(abuf, &abuf);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
return (error != 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zio_ddt_child_write_ready(zio_t *zio)
|
|
|
|
{
|
|
|
|
int p = zio->io_prop.zp_copies;
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
zio_t *pio;
|
|
|
|
|
|
|
|
if (zio->io_error)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
|
|
|
|
ASSERT(dde->dde_lead_zio[p] == zio);
|
|
|
|
|
|
|
|
ddt_phys_fill(ddp, zio->io_bp);
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
zio_link_t *zl = NULL;
|
2016-10-14 03:59:18 +03:00
|
|
|
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zio_ddt_child_write_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
int p = zio->io_prop.zp_copies;
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
|
|
|
|
ASSERT(ddp->ddp_refcnt == 0);
|
|
|
|
ASSERT(dde->dde_lead_zio[p] == zio);
|
|
|
|
dde->dde_lead_zio[p] = NULL;
|
|
|
|
|
|
|
|
if (zio->io_error == 0) {
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_link_t *zl = NULL;
|
|
|
|
while (zio_walk_parents(zio, &zl) != NULL)
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_phys_addref(ddp);
|
|
|
|
} else {
|
|
|
|
ddt_phys_clear(ddp);
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_write(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
uint64_t txg = zio->io_txg;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
int p = zp->zp_copies;
|
|
|
|
zio_t *cio = NULL;
|
|
|
|
ddt_t *ddt = ddt_select(spa, bp);
|
|
|
|
ddt_entry_t *dde;
|
|
|
|
ddt_phys_t *ddp;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
|
|
|
|
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
|
2016-09-13 04:34:19 +03:00
|
|
|
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
dde = ddt_lookup(ddt, bp, B_TRUE);
|
|
|
|
ddp = &dde->dde_phys[p];
|
|
|
|
|
|
|
|
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
|
|
|
|
/*
|
|
|
|
* If we're using a weak checksum, upgrade to a strong checksum
|
|
|
|
* and try again. If we're already using a strong checksum,
|
|
|
|
* we can't resolve it, so just convert to an ordinary write.
|
|
|
|
* (And automatically e-mail a paper to Nature?)
|
|
|
|
*/
|
2016-06-16 01:47:05 +03:00
|
|
|
if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
|
|
|
|
ZCHECKSUM_FLAG_DEDUP)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zp->zp_checksum = spa_dedup_checksum(spa);
|
|
|
|
zio_pop_transforms(zio);
|
|
|
|
zio->io_stage = ZIO_STAGE_OPEN;
|
|
|
|
BP_ZERO(bp);
|
|
|
|
} else {
|
2013-05-10 23:47:54 +04:00
|
|
|
zp->zp_dedup = B_FALSE;
|
2019-06-21 04:30:40 +03:00
|
|
|
BP_SET_DEDUP(bp, B_FALSE);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2019-06-21 04:30:40 +03:00
|
|
|
ASSERT(!BP_GET_DEDUP(bp));
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
|
|
|
ddt_exit(ddt);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
|
|
|
|
if (ddp->ddp_phys_birth != 0)
|
|
|
|
ddt_bp_fill(ddp, bp, txg);
|
|
|
|
if (dde->dde_lead_zio[p] != NULL)
|
|
|
|
zio_add_child(zio, dde->dde_lead_zio[p]);
|
|
|
|
else
|
|
|
|
ddt_phys_addref(ddp);
|
|
|
|
} else if (zio->io_bp_override) {
|
|
|
|
ASSERT(bp->blk_birth == txg);
|
|
|
|
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
|
|
|
|
ddt_phys_fill(ddp, bp);
|
|
|
|
ddt_phys_addref(ddp);
|
|
|
|
} else {
|
2016-07-22 18:52:49 +03:00
|
|
|
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
2016-07-11 20:45:52 +03:00
|
|
|
zio->io_orig_size, zio->io_orig_size, zp,
|
2016-05-15 18:02:28 +03:00
|
|
|
zio_ddt_child_write_ready, NULL, NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_child_write_done, dde, zio->io_priority,
|
|
|
|
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
dde->dde_lead_zio[p] = cio;
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
|
2020-02-29 01:49:44 +03:00
|
|
|
zio_nowait(cio);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
ddt_entry_t *freedde; /* for debugging */
|
2008-12-03 23:09:06 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_free(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
ddt_t *ddt = ddt_select(spa, bp);
|
|
|
|
ddt_entry_t *dde;
|
|
|
|
ddt_phys_t *ddp;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
|
2013-03-19 23:05:08 +04:00
|
|
|
if (dde) {
|
|
|
|
ddp = ddt_phys_select(dde, bp);
|
|
|
|
if (ddp)
|
|
|
|
ddt_phys_decref(ddp);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_exit(ddt);
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Allocate and free blocks
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
static zio_t *
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
zio_io_to_allocate(spa_t *spa, int allocator)
|
2016-10-14 03:59:18 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
|
2016-10-14 03:59:18 +03:00
|
|
|
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
zio = avl_first(&spa->spa_alloc_trees[allocator]);
|
2016-10-14 03:59:18 +03:00
|
|
|
if (zio == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to place a reservation for this zio. If we're unable to
|
|
|
|
* reserve then we throttle.
|
|
|
|
*/
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
ASSERT3U(zio->io_allocator, ==, allocator);
|
2018-09-06 04:33:36 +03:00
|
|
|
if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
|
2016-10-14 03:59:18 +03:00
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
avl_remove(&spa->spa_alloc_trees[allocator], zio);
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
|
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_dva_throttle(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
zio_t *nio;
|
2018-09-06 04:33:36 +03:00
|
|
|
metaslab_class_t *mc;
|
|
|
|
|
|
|
|
/* locate an appropriate allocation class */
|
|
|
|
mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
|
|
|
|
zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
|
2018-09-06 04:33:36 +03:00
|
|
|
!mc->mc_alloc_throttle_enabled ||
|
2016-10-14 03:59:18 +03:00
|
|
|
zio->io_child_type == ZIO_CHILD_GANG ||
|
|
|
|
zio->io_flags & ZIO_FLAG_NODATA) {
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
|
|
|
|
|
|
|
ASSERT3U(zio->io_queued_timestamp, >, 0);
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
|
|
|
|
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
zbookmark_phys_t *bm = &zio->io_bookmark;
|
|
|
|
/*
|
|
|
|
* We want to try to use as many allocators as possible to help improve
|
|
|
|
* performance, but we also want logically adjacent IOs to be physically
|
|
|
|
* adjacent to improve sequential read performance. We chunk each object
|
|
|
|
* into 2^20 block regions, and then hash based on the objset, object,
|
|
|
|
* level, and region to accomplish both of these goals.
|
|
|
|
*/
|
|
|
|
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
|
|
|
|
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
|
|
|
|
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
2018-09-06 04:33:36 +03:00
|
|
|
zio->io_metaslab_class = mc;
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
|
2018-09-06 04:33:36 +03:00
|
|
|
nio = zio_io_to_allocate(spa, zio->io_allocator);
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (nio);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
|
2018-09-06 04:33:36 +03:00
|
|
|
static void
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
zio_allocate_dispatch(spa_t *spa, int allocator)
|
2016-10-14 03:59:18 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
mutex_enter(&spa->spa_alloc_locks[allocator]);
|
|
|
|
zio = zio_io_to_allocate(spa, allocator);
|
|
|
|
mutex_exit(&spa->spa_alloc_locks[allocator]);
|
2016-10-14 03:59:18 +03:00
|
|
|
if (zio == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
|
|
|
|
ASSERT0(zio->io_error);
|
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
|
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_dva_allocate(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
2018-09-06 04:33:36 +03:00
|
|
|
metaslab_class_t *mc;
|
2008-11-20 23:01:55 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
int error;
|
2011-07-26 23:08:52 +04:00
|
|
|
int flags = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_gang_leader == NULL) {
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
|
|
|
zio->io_gang_leader = zio;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(BP_IS_HOLE(bp));
|
2013-05-11 01:17:03 +04:00
|
|
|
ASSERT0(BP_GET_NDVAS(bp));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT3U(zio->io_prop.zp_copies, >, 0);
|
|
|
|
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
|
|
|
|
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
|
2016-10-14 03:59:18 +03:00
|
|
|
if (zio->io_flags & ZIO_FLAG_NODATA)
|
|
|
|
flags |= METASLAB_DONT_THROTTLE;
|
|
|
|
if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
|
|
|
|
flags |= METASLAB_GANG_CHILD;
|
|
|
|
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
|
|
|
|
flags |= METASLAB_ASYNC_ALLOC;
|
|
|
|
|
2018-09-06 04:33:36 +03:00
|
|
|
/*
|
|
|
|
* if not already chosen, locate an appropriate allocation class
|
|
|
|
*/
|
|
|
|
mc = zio->io_metaslab_class;
|
|
|
|
if (mc == NULL) {
|
|
|
|
mc = spa_preferred_class(spa, zio->io_size,
|
|
|
|
zio->io_prop.zp_type, zio->io_prop.zp_level,
|
|
|
|
zio->io_prop.zp_zpl_smallblk);
|
|
|
|
zio->io_metaslab_class = mc;
|
|
|
|
}
|
|
|
|
|
Set aside a metaslab for ZIL blocks
Mixing ZIL and normal allocations has several problems:
1. The ZIL allocations are allocated, written to disk, and then a few
seconds later freed. This leaves behind holes (free segments) where the
ZIL blocks used to be, which increases fragmentation, which negatively
impacts performance.
2. When under moderate load, ZIL allocations are of 128KB. If the pool
is fairly fragmented, there may not be many free chunks of that size.
This causes ZFS to load more metaslabs to locate free segments of 128KB
or more. The loading happens synchronously (from zil_commit()), and can
take around a second even if the metaslab's spacemap is cached in the
ARC. All concurrent synchronous operations on this filesystem must wait
while the metaslab is loading. This can cause a significant performance
impact.
3. If the pool is very fragmented, there may be zero free chunks of
128KB or more. In this case, the ZIL falls back to txg_wait_synced(),
which has an enormous performance impact.
These problems can be eliminated by using a dedicated log device
("slog"), even one with the same performance characteristics as the
normal devices.
This change sets aside one metaslab from each top-level vdev that is
preferentially used for ZIL allocations (vdev_log_mg,
spa_embedded_log_class). From an allocation perspective, this is
similar to having a dedicated log device, and it eliminates the
above-mentioned performance problems.
Log (ZIL) blocks can be allocated from the following locations. Each
one is tried in order until the allocation succeeds:
1. dedicated log vdevs, aka "slog" (spa_log_class)
2. embedded slog metaslabs (spa_embedded_log_class)
3. other metaslabs in normal vdevs (spa_normal_class)
The space required for the embedded slog metaslabs is usually between
0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop"
space that is not available for user data.
On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity,
and recordsize=8k, testing shows a ~50% performance increase on random
8k sync writes. On even more fragmented systems (which hit problem #3
above and call txg_wait_synced()), the performance improvement can be
arbitrarily large (>100x).
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11389
2021-01-22 02:12:54 +03:00
|
|
|
/*
|
|
|
|
* Try allocating the block in the usual metaslab class.
|
|
|
|
* If that's full, allocate it in the normal class.
|
|
|
|
* If that's full, allocate as a gang block,
|
|
|
|
* and if all are full, the allocation fails (which shouldn't happen).
|
|
|
|
*
|
|
|
|
* Note that we do not fall back on embedded slog (ZIL) space, to
|
|
|
|
* preserve unfragmented slog space, which is critical for decent
|
|
|
|
* sync write performance. If a log allocation fails, we will fall
|
|
|
|
* back to spa_sync() which is abysmal for performance.
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
error = metaslab_alloc(spa, mc, zio->io_size, bp,
|
2017-01-12 22:52:56 +03:00
|
|
|
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
&zio->io_alloc_list, zio, zio->io_allocator);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2018-09-06 04:33:36 +03:00
|
|
|
/*
|
|
|
|
* Fallback to normal class when an alloc class is full
|
|
|
|
*/
|
|
|
|
if (error == ENOSPC && mc != spa_normal_class(spa)) {
|
|
|
|
/*
|
|
|
|
* If throttling, transfer reservation over to normal class.
|
|
|
|
* The io_allocator slot can remain the same even though we
|
|
|
|
* are switching classes.
|
|
|
|
*/
|
|
|
|
if (mc->mc_alloc_throttle_enabled &&
|
|
|
|
(zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
|
|
|
|
metaslab_class_throttle_unreserve(mc,
|
|
|
|
zio->io_prop.zp_copies, zio->io_allocator, zio);
|
|
|
|
zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
|
|
|
|
|
Set aside a metaslab for ZIL blocks
Mixing ZIL and normal allocations has several problems:
1. The ZIL allocations are allocated, written to disk, and then a few
seconds later freed. This leaves behind holes (free segments) where the
ZIL blocks used to be, which increases fragmentation, which negatively
impacts performance.
2. When under moderate load, ZIL allocations are of 128KB. If the pool
is fairly fragmented, there may not be many free chunks of that size.
This causes ZFS to load more metaslabs to locate free segments of 128KB
or more. The loading happens synchronously (from zil_commit()), and can
take around a second even if the metaslab's spacemap is cached in the
ARC. All concurrent synchronous operations on this filesystem must wait
while the metaslab is loading. This can cause a significant performance
impact.
3. If the pool is very fragmented, there may be zero free chunks of
128KB or more. In this case, the ZIL falls back to txg_wait_synced(),
which has an enormous performance impact.
These problems can be eliminated by using a dedicated log device
("slog"), even one with the same performance characteristics as the
normal devices.
This change sets aside one metaslab from each top-level vdev that is
preferentially used for ZIL allocations (vdev_log_mg,
spa_embedded_log_class). From an allocation perspective, this is
similar to having a dedicated log device, and it eliminates the
above-mentioned performance problems.
Log (ZIL) blocks can be allocated from the following locations. Each
one is tried in order until the allocation succeeds:
1. dedicated log vdevs, aka "slog" (spa_log_class)
2. embedded slog metaslabs (spa_embedded_log_class)
3. other metaslabs in normal vdevs (spa_normal_class)
The space required for the embedded slog metaslabs is usually between
0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop"
space that is not available for user data.
On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity,
and recordsize=8k, testing shows a ~50% performance increase on random
8k sync writes. On even more fragmented systems (which hit problem #3
above and call txg_wait_synced()), the performance improvement can be
arbitrarily large (>100x).
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11389
2021-01-22 02:12:54 +03:00
|
|
|
VERIFY(metaslab_class_throttle_reserve(
|
|
|
|
spa_normal_class(spa),
|
2018-09-06 04:33:36 +03:00
|
|
|
zio->io_prop.zp_copies, zio->io_allocator, zio,
|
|
|
|
flags | METASLAB_MUST_RESERVE));
|
|
|
|
}
|
Set aside a metaslab for ZIL blocks
Mixing ZIL and normal allocations has several problems:
1. The ZIL allocations are allocated, written to disk, and then a few
seconds later freed. This leaves behind holes (free segments) where the
ZIL blocks used to be, which increases fragmentation, which negatively
impacts performance.
2. When under moderate load, ZIL allocations are of 128KB. If the pool
is fairly fragmented, there may not be many free chunks of that size.
This causes ZFS to load more metaslabs to locate free segments of 128KB
or more. The loading happens synchronously (from zil_commit()), and can
take around a second even if the metaslab's spacemap is cached in the
ARC. All concurrent synchronous operations on this filesystem must wait
while the metaslab is loading. This can cause a significant performance
impact.
3. If the pool is very fragmented, there may be zero free chunks of
128KB or more. In this case, the ZIL falls back to txg_wait_synced(),
which has an enormous performance impact.
These problems can be eliminated by using a dedicated log device
("slog"), even one with the same performance characteristics as the
normal devices.
This change sets aside one metaslab from each top-level vdev that is
preferentially used for ZIL allocations (vdev_log_mg,
spa_embedded_log_class). From an allocation perspective, this is
similar to having a dedicated log device, and it eliminates the
above-mentioned performance problems.
Log (ZIL) blocks can be allocated from the following locations. Each
one is tried in order until the allocation succeeds:
1. dedicated log vdevs, aka "slog" (spa_log_class)
2. embedded slog metaslabs (spa_embedded_log_class)
3. other metaslabs in normal vdevs (spa_normal_class)
The space required for the embedded slog metaslabs is usually between
0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop"
space that is not available for user data.
On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity,
and recordsize=8k, testing shows a ~50% performance increase on random
8k sync writes. On even more fragmented systems (which hit problem #3
above and call txg_wait_synced()), the performance improvement can be
arbitrarily large (>100x).
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11389
2021-01-22 02:12:54 +03:00
|
|
|
zio->io_metaslab_class = mc = spa_normal_class(spa);
|
|
|
|
if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
|
|
|
|
zfs_dbgmsg("%s: metaslab allocation failure, "
|
|
|
|
"trying normal class: zio %px, size %llu, error %d",
|
|
|
|
spa_name(spa), zio, zio->io_size, error);
|
|
|
|
}
|
2018-09-06 04:33:36 +03:00
|
|
|
|
|
|
|
error = metaslab_alloc(spa, mc, zio->io_size, bp,
|
|
|
|
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
|
|
|
|
&zio->io_alloc_list, zio, zio->io_allocator);
|
|
|
|
}
|
|
|
|
|
Set aside a metaslab for ZIL blocks
Mixing ZIL and normal allocations has several problems:
1. The ZIL allocations are allocated, written to disk, and then a few
seconds later freed. This leaves behind holes (free segments) where the
ZIL blocks used to be, which increases fragmentation, which negatively
impacts performance.
2. When under moderate load, ZIL allocations are of 128KB. If the pool
is fairly fragmented, there may not be many free chunks of that size.
This causes ZFS to load more metaslabs to locate free segments of 128KB
or more. The loading happens synchronously (from zil_commit()), and can
take around a second even if the metaslab's spacemap is cached in the
ARC. All concurrent synchronous operations on this filesystem must wait
while the metaslab is loading. This can cause a significant performance
impact.
3. If the pool is very fragmented, there may be zero free chunks of
128KB or more. In this case, the ZIL falls back to txg_wait_synced(),
which has an enormous performance impact.
These problems can be eliminated by using a dedicated log device
("slog"), even one with the same performance characteristics as the
normal devices.
This change sets aside one metaslab from each top-level vdev that is
preferentially used for ZIL allocations (vdev_log_mg,
spa_embedded_log_class). From an allocation perspective, this is
similar to having a dedicated log device, and it eliminates the
above-mentioned performance problems.
Log (ZIL) blocks can be allocated from the following locations. Each
one is tried in order until the allocation succeeds:
1. dedicated log vdevs, aka "slog" (spa_log_class)
2. embedded slog metaslabs (spa_embedded_log_class)
3. other metaslabs in normal vdevs (spa_normal_class)
The space required for the embedded slog metaslabs is usually between
0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop"
space that is not available for user data.
On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity,
and recordsize=8k, testing shows a ~50% performance increase on random
8k sync writes. On even more fragmented systems (which hit problem #3
above and call txg_wait_synced()), the performance improvement can be
arbitrarily large (>100x).
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11389
2021-01-22 02:12:54 +03:00
|
|
|
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
|
|
|
|
if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
|
|
|
|
zfs_dbgmsg("%s: metaslab allocation failure, "
|
|
|
|
"trying ganging: zio %px, size %llu, error %d",
|
|
|
|
spa_name(spa), zio, zio->io_size, error);
|
|
|
|
}
|
|
|
|
return (zio_write_gang_block(zio, mc));
|
|
|
|
}
|
2016-10-14 03:59:18 +03:00
|
|
|
if (error != 0) {
|
Set aside a metaslab for ZIL blocks
Mixing ZIL and normal allocations has several problems:
1. The ZIL allocations are allocated, written to disk, and then a few
seconds later freed. This leaves behind holes (free segments) where the
ZIL blocks used to be, which increases fragmentation, which negatively
impacts performance.
2. When under moderate load, ZIL allocations are of 128KB. If the pool
is fairly fragmented, there may not be many free chunks of that size.
This causes ZFS to load more metaslabs to locate free segments of 128KB
or more. The loading happens synchronously (from zil_commit()), and can
take around a second even if the metaslab's spacemap is cached in the
ARC. All concurrent synchronous operations on this filesystem must wait
while the metaslab is loading. This can cause a significant performance
impact.
3. If the pool is very fragmented, there may be zero free chunks of
128KB or more. In this case, the ZIL falls back to txg_wait_synced(),
which has an enormous performance impact.
These problems can be eliminated by using a dedicated log device
("slog"), even one with the same performance characteristics as the
normal devices.
This change sets aside one metaslab from each top-level vdev that is
preferentially used for ZIL allocations (vdev_log_mg,
spa_embedded_log_class). From an allocation perspective, this is
similar to having a dedicated log device, and it eliminates the
above-mentioned performance problems.
Log (ZIL) blocks can be allocated from the following locations. Each
one is tried in order until the allocation succeeds:
1. dedicated log vdevs, aka "slog" (spa_log_class)
2. embedded slog metaslabs (spa_embedded_log_class)
3. other metaslabs in normal vdevs (spa_normal_class)
The space required for the embedded slog metaslabs is usually between
0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop"
space that is not available for user data.
On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity,
and recordsize=8k, testing shows a ~50% performance increase on random
8k sync writes. On even more fragmented systems (which hit problem #3
above and call txg_wait_synced()), the performance improvement can be
arbitrarily large (>100x).
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11389
2021-01-22 02:12:54 +03:00
|
|
|
if (error != ENOSPC ||
|
|
|
|
(zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) {
|
|
|
|
zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
|
|
|
|
"size %llu, error %d",
|
|
|
|
spa_name(spa), zio, zio->io_size, error);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_error = error;
|
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_dva_free(zio_t *zio)
|
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_dva_claim(zio_t *zio)
|
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
int error;
|
|
|
|
|
|
|
|
error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
|
|
|
|
if (error)
|
|
|
|
zio->io_error = error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Undo an allocation. This is used by zio_done() when an I/O fails
|
|
|
|
* and we want to give back the block we just allocated.
|
|
|
|
* This handles both normal blocks and gang blocks.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
|
|
|
|
{
|
|
|
|
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_bp_override == NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (!BP_IS_HOLE(bp))
|
2010-05-29 00:45:14 +04:00
|
|
|
metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (gn != NULL) {
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_dva_unallocate(zio, gn->gn_child[g],
|
|
|
|
&gn->gn_gbh->zg_blkptr[g]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to allocate an intent log block. Return 0 on success, errno on failure.
|
|
|
|
*/
|
|
|
|
int
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
|
|
|
|
uint64_t size, boolean_t *slog)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
int error = 1;
|
2017-01-12 22:52:56 +03:00
|
|
|
zio_alloc_list_t io_alloc_list;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(txg > spa_syncing_txg(spa));
|
|
|
|
|
2017-01-12 22:52:56 +03:00
|
|
|
metaslab_trace_init(&io_alloc_list);
|
2018-09-06 04:33:36 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Block pointer fields are useful to metaslabs for stats and debugging.
|
|
|
|
* Fill in the obvious ones before calling into metaslab_alloc().
|
|
|
|
*/
|
|
|
|
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
|
|
|
|
BP_SET_PSIZE(new_bp, size);
|
|
|
|
BP_SET_LEVEL(new_bp, 0);
|
|
|
|
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
/*
|
|
|
|
* When allocating a zil block, we don't have information about
|
|
|
|
* the final destination of the block except the objset it's part
|
|
|
|
* of, so we just hash the objset ID to pick the allocator to get
|
|
|
|
* some parallelism.
|
|
|
|
*/
|
Only examine best metaslabs on each vdev
On a system with very high fragmentation, we may need to do lots of gang
allocations (e.g. most indirect block allocations (~50KB) may need to
gang). Before failing a "normal" allocation and resorting to ganging, we
try every metaslab. This has the impact of loading every metaslab (not
a huge deal since we now typically keep all metaslabs loaded), and also
iterating over every metaslab for every failing allocation. If there are
many metaslabs (more than the typical ~200, e.g. due to vdev expansion
or very large vdevs), the CPU cost of this iteration can be very
impactful. This iteration is done with the mg_lock held, creating long
hold times and high lock contention for concurrent allocations,
ultimately causing long txg sync times and poor application performance.
To address this, this commit changes the behavior of "normal" (not
try_hard, not ZIL) allocations. These will now only examine the 100
best metaslabs (as determined by their ms_weight). If none of these
have a large enough free segment, then the allocation will fail and
we'll fall back on ganging.
To accomplish this, we will now (normally) gang before doing a
`try_hard` allocation. Non-try_hard allocations will only examine the
100 best metaslabs of each vdev. In summary, we will first try normal
allocation. If that fails then we will do a gang allocation. If that
fails then we will do a "try hard" gang allocation. If that fails then
we will have a multi-layer gang block.
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11327
2020-12-17 01:40:05 +03:00
|
|
|
int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
|
|
|
|
int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
|
|
|
|
spa->spa_alloc_count;
|
Set aside a metaslab for ZIL blocks
Mixing ZIL and normal allocations has several problems:
1. The ZIL allocations are allocated, written to disk, and then a few
seconds later freed. This leaves behind holes (free segments) where the
ZIL blocks used to be, which increases fragmentation, which negatively
impacts performance.
2. When under moderate load, ZIL allocations are of 128KB. If the pool
is fairly fragmented, there may not be many free chunks of that size.
This causes ZFS to load more metaslabs to locate free segments of 128KB
or more. The loading happens synchronously (from zil_commit()), and can
take around a second even if the metaslab's spacemap is cached in the
ARC. All concurrent synchronous operations on this filesystem must wait
while the metaslab is loading. This can cause a significant performance
impact.
3. If the pool is very fragmented, there may be zero free chunks of
128KB or more. In this case, the ZIL falls back to txg_wait_synced(),
which has an enormous performance impact.
These problems can be eliminated by using a dedicated log device
("slog"), even one with the same performance characteristics as the
normal devices.
This change sets aside one metaslab from each top-level vdev that is
preferentially used for ZIL allocations (vdev_log_mg,
spa_embedded_log_class). From an allocation perspective, this is
similar to having a dedicated log device, and it eliminates the
above-mentioned performance problems.
Log (ZIL) blocks can be allocated from the following locations. Each
one is tried in order until the allocation succeeds:
1. dedicated log vdevs, aka "slog" (spa_log_class)
2. embedded slog metaslabs (spa_embedded_log_class)
3. other metaslabs in normal vdevs (spa_normal_class)
The space required for the embedded slog metaslabs is usually between
0.5% and 1.0% of the pool, and comes out of the existing 3.2% of "slop"
space that is not available for user data.
On an all-ssd system with 4TB storage, 87% fragmentation, 60% capacity,
and recordsize=8k, testing shows a ~50% performance increase on random
8k sync writes. On even more fragmented systems (which hit problem #3
above and call txg_wait_synced()), the performance improvement can be
arbitrarily large (>100x).
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11389
2021-01-22 02:12:54 +03:00
|
|
|
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
|
|
|
|
txg, NULL, flags, &io_alloc_list, NULL, allocator);
|
|
|
|
*slog = (error == 0);
|
|
|
|
if (error != 0) {
|
|
|
|
error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
|
|
|
|
new_bp, 1, txg, NULL, flags,
|
|
|
|
&io_alloc_list, NULL, allocator);
|
|
|
|
}
|
|
|
|
if (error != 0) {
|
|
|
|
error = metaslab_alloc(spa, spa_normal_class(spa), size,
|
|
|
|
new_bp, 1, txg, NULL, flags,
|
|
|
|
&io_alloc_list, NULL, allocator);
|
2012-04-20 02:55:28 +04:00
|
|
|
}
|
2017-01-12 22:52:56 +03:00
|
|
|
metaslab_trace_fini(&io_alloc_list);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
BP_SET_LSIZE(new_bp, size);
|
|
|
|
BP_SET_PSIZE(new_bp, size);
|
|
|
|
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_CHECKSUM(new_bp,
|
|
|
|
spa_version(spa) >= SPA_VERSION_SLIM_ZIL
|
|
|
|
? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
|
|
|
|
BP_SET_LEVEL(new_bp, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_DEDUP(new_bp, 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* encrypted blocks will require an IV and salt. We generate
|
|
|
|
* these now since we will not be rewriting the bp at
|
|
|
|
* rewrite time.
|
|
|
|
*/
|
|
|
|
if (os->os_encrypted) {
|
|
|
|
uint8_t iv[ZIO_DATA_IV_LEN];
|
|
|
|
uint8_t salt[ZIO_DATA_SALT_LEN];
|
|
|
|
|
|
|
|
BP_SET_CRYPT(new_bp, B_TRUE);
|
|
|
|
VERIFY0(spa_crypt_get_salt(spa,
|
|
|
|
dmu_objset_id(os), salt));
|
|
|
|
VERIFY0(zio_crypt_generate_iv(iv));
|
|
|
|
|
|
|
|
zio_crypt_encode_params_bp(new_bp, salt, iv);
|
|
|
|
}
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
} else {
|
|
|
|
zfs_dbgmsg("%s: zil block allocation failure: "
|
|
|
|
"size %llu, error %d", spa_name(spa), size, error);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Read and write to physical devices
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2014-10-21 02:07:45 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Issue an I/O to the underlying vdev. Typically the issue pipeline
|
|
|
|
* stops after this stage and will resume upon I/O completion.
|
|
|
|
* However, there are instances where the vdev layer may need to
|
|
|
|
* continue the pipeline when an I/O was not issued. Since the I/O
|
|
|
|
* that was sent to the vdev layer might be different than the one
|
|
|
|
* currently active in the pipeline (see vdev_queue_io()), we explicitly
|
|
|
|
* force the underlying vdev layers to call either zio_execute() or
|
|
|
|
* zio_interrupt() to ensure that the pipeline continues with the correct I/O.
|
|
|
|
*/
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_vdev_io_start(zio_t *zio)
|
|
|
|
{
|
|
|
|
vdev_t *vd = zio->io_vd;
|
|
|
|
uint64_t align;
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
|
2016-02-29 21:05:23 +03:00
|
|
|
zio->io_delay = 0;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd == NULL) {
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
|
|
|
|
spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* The mirror_ops handle multiple DVAs in a single BP.
|
|
|
|
*/
|
2014-10-21 02:07:45 +04:00
|
|
|
vdev_mirror_ops.vdev_op_io_start(zio);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT3P(zio->io_logical, !=, zio);
|
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debugability of the ZFS pool
load (and import) process. This includes:
7638 Refactor spa_load_impl into several functions
8961 SPA load/import should tell us why it failed
7277 zdb should be able to print zfs_dbgmsg's
To iterate on top of that, there's a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.
The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.
The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.
When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.
This new two-step pool load process now allows rewinding pools accross
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.
With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
of data, this feature is for now restricted to a read-only import.
Porting notes (ZTS):
* Fix 'make dist' target in zpool_import
* The maximum path length allowed by tar is 99 characters. Several
of the new test cases exceeded this limit resulting in them not
being included in the tarball. Shorten the names slightly.
* Set/get tunables using accessor functions.
* Get last synced txg via the "zfs_txg_history" mechanism.
* Clear zinject handlers in cleanup for import_cache_device_replaced
and import_rewind_device_replaced in order that the zpool can be
exported if there is an error.
* Increase FILESIZE to 8G in zfs-test.sh to allow for a larger
ext4 file system to be created on ZFS_DISK2. Also, there's
no need to partition ZFS_DISK2 at all. The partitioning had
already been disabled for multipath devices. Among other things,
the partitioning steals some space from the ext4 file system,
makes it difficult to accurately calculate the paramters to
parted and can make some of the tests fail.
* Increase FS_SIZE and FILE_SIZE in the zpool_import test
configuration now that FILESIZE is larger.
* Write more data in order that device evacuation take lonnger in
a couple tests.
* Use mkdir -p to avoid errors when the directory already exists.
* Remove use of sudo in import_rewind_config_changed.
Authored by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9075
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123
Closes #7459
2016-07-22 17:39:36 +03:00
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE) {
|
|
|
|
ASSERT(spa->spa_trust_config);
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
/*
|
|
|
|
* Note: the code can handle other kinds of writes,
|
|
|
|
* but we don't expect them.
|
|
|
|
*/
|
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debugability of the ZFS pool
load (and import) process. This includes:
7638 Refactor spa_load_impl into several functions
8961 SPA load/import should tell us why it failed
7277 zdb should be able to print zfs_dbgmsg's
To iterate on top of that, there's a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.
The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.
The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.
When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.
This new two-step pool load process now allows rewinding pools accross
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.
With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
of data, this feature is for now restricted to a read-only import.
Porting notes (ZTS):
* Fix 'make dist' target in zpool_import
* The maximum path length allowed by tar is 99 characters. Several
of the new test cases exceeded this limit resulting in them not
being included in the tarball. Shorten the names slightly.
* Set/get tunables using accessor functions.
* Get last synced txg via the "zfs_txg_history" mechanism.
* Clear zinject handlers in cleanup for import_cache_device_replaced
and import_rewind_device_replaced in order that the zpool can be
exported if there is an error.
* Increase FILESIZE to 8G in zfs-test.sh to allow for a larger
ext4 file system to be created on ZFS_DISK2. Also, there's
no need to partition ZFS_DISK2 at all. The partitioning had
already been disabled for multipath devices. Among other things,
the partitioning steals some space from the ext4 file system,
makes it difficult to accurately calculate the paramters to
parted and can make some of the tests fail.
* Increase FS_SIZE and FILE_SIZE in the zpool_import test
configuration now that FILESIZE is larger.
* Write more data in order that device evacuation take lonnger in
a couple tests.
* Use mkdir -p to avoid errors when the directory already exists.
* Remove use of sudo in import_rewind_config_changed.
Authored by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9075
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123
Closes #7459
2016-07-22 17:39:36 +03:00
|
|
|
if (zio->io_vd->vdev_removing) {
|
|
|
|
ASSERT(zio->io_flags &
|
|
|
|
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
|
|
|
|
ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
|
|
|
|
}
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
}
|
2016-10-14 03:59:18 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
align = 1ULL << vd->vdev_top->vdev_ashift;
|
|
|
|
|
2014-09-23 03:42:03 +04:00
|
|
|
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
|
|
|
|
P2PHASE(zio->io_size, align) != 0) {
|
|
|
|
/* Transform logical writes to be a full physical block size. */
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t asize = P2ROUNDUP(zio->io_size, align);
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
|
2012-10-25 02:22:31 +04:00
|
|
|
ASSERT(vd == vd->vdev_top);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE) {
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_copy(abuf, zio->io_abd, zio->io_size);
|
|
|
|
abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_push_transform(zio, abuf, asize, asize, zio_subblock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2014-09-23 03:42:03 +04:00
|
|
|
/*
|
|
|
|
* If this is not a physical io, make sure that it is properly aligned
|
|
|
|
* before proceeding.
|
|
|
|
*/
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
|
|
|
|
ASSERT0(P2PHASE(zio->io_offset, align));
|
|
|
|
ASSERT0(P2PHASE(zio->io_size, align));
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* For physical writes, we allow 512b aligned writes and assume
|
|
|
|
* the device will perform a read-modify-write as necessary.
|
|
|
|
*/
|
|
|
|
ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
|
|
|
|
ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
|
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
|
2009-01-16 00:59:39 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is a repair I/O, and there's no self-healing involved --
|
|
|
|
* that is, we're just resilvering what we expect to resilver --
|
|
|
|
* then don't do the I/O unless zio's txg is actually in vd's DTL.
|
OpenZFS 9290 - device removal reduces redundancy of mirrors
Mirrors are supposed to provide redundancy in the face of whole-disk
failure and silent damage (e.g. some data on disk is not right, but ZFS
hasn't detected the whole device as being broken). However, the current
device removal implementation bypasses some of the mirror's redundancy.
Note that in no case is incorrect data returned, but we might get a
checksum error when we should have been able to find the right data.
There are two underlying problems:
1. When we remove a mirror device, we only read one side of the mirror.
Since we can't verify the checksum, this side may be silently bad, but
the good data is on the other side of the mirror (which we didn't read).
This can cause the removal to "bake in" the busted data – all copies of
the data in the new location are the same, busted version, while we left
the good version behind.
The fix for this is to read and copy both sides of the mirror. If the
old and new vdevs are mirrors, we will read both sides of the old
mirror, and write each copy to the corresponding side of the new mirror.
(If the old and new vdevs have a different number of children, we will
do this as best as possible.) Even though we aren't verifying checksums,
this ensures that as long as there's a good copy of the data, we'll have
a good copy after the removal, even if there's silent damage to one side
of the mirror. If we're removing a mirror that has some silent damage,
we'll have exactly the same damage in the new location (assuming that
the new location is also a mirror).
2. When we read from an indirect vdev that points to a mirror vdev, we
only consider one copy of the data. This can lead to reduced effective
redundancy, because we might read a bad copy of the data from one side
of the mirror, and not retry the other, good side of the mirror.
Note that the problem is not with the removal process, but rather after
the removal has completed (having copied correct data to both sides of
the mirror), if one side of the new mirror is silently damaged, we
encounter the problem when reading the relocated data via the indirect
vdev. Also note that the problem doesn't occur when ZFS knows that one
side of the mirror is bad, e.g. when a disk entirely fails or is
offlined.
The impact is that reads (from indirect vdevs that point to mirrors) may
return a checksum error even though the good data exists on one side of
the mirror, and scrub doesn't repair all data on the mirror (if some of
it is pointed to via an indirect vdev).
The fix for this is complicated by "split blocks" - one logical block
may be split into two (or more) pieces with each piece moved to a
different new location. In this case we need to read all versions of
each split (one from each side of the mirror), and figure out which
combination of versions results in the correct checksum, and then repair
the incorrect versions.
This ensures that we supply the same redundancy whether you use device
removal or not. For example, if a mirror has small silent errors on all
of its children, we can still reconstruct the correct data, as long as
those errors are at sufficiently-separated offsets (specifically,
separated by the largest block size - default of 128KB, but up to 16MB).
Porting notes:
* A new indirect vdev check was moved from dsl_scan_needs_resilver_cb()
to dsl_scan_needs_resilver(), which was added to ZoL as part of the
sequential scrub work.
* Passed NULL for zfs_ereport_post_checksum()'s zbookmark_phys_t
parameter. The extra parameter is unique to ZoL.
* When posting indirect checksum errors the ABD can be passed directly,
zfs_ereport_post_checksum() is not yet ABD-aware in OpenZFS.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9290
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/591
Closes #6900
2018-02-13 22:37:56 +03:00
|
|
|
* This prevents spurious resilvering.
|
|
|
|
*
|
|
|
|
* There are a few ways that we can end up creating these spurious
|
|
|
|
* resilver i/os:
|
|
|
|
*
|
|
|
|
* 1. A resilver i/o will be issued if any DVA in the BP has a
|
|
|
|
* dirty DTL. The mirror code will issue resilver writes to
|
|
|
|
* each DVA, including the one(s) that are not on vdevs with dirty
|
|
|
|
* DTLs.
|
|
|
|
*
|
|
|
|
* 2. With nested replication, which happens when we have a
|
|
|
|
* "replacing" or "spare" vdev that's a child of a mirror or raidz.
|
|
|
|
* For example, given mirror(replacing(A+B), C), it's likely that
|
|
|
|
* only A is out of date (it's the new device). In this case, we'll
|
|
|
|
* read from C, then use the data to resilver A+B -- but we don't
|
|
|
|
* actually want to resilver B, just A. The top-level mirror has no
|
|
|
|
* way to know this, so instead we just discard unnecessary repairs
|
|
|
|
* as we work our way down the vdev tree.
|
|
|
|
*
|
|
|
|
* 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
|
|
|
|
* The same logic applies to any form of nested replication: ditto
|
|
|
|
* + mirror, RAID-Z + replacing, etc.
|
|
|
|
*
|
|
|
|
* However, indirect vdevs point off to other vdevs which may have
|
|
|
|
* DTL's, so we never bypass them. The child i/os on concrete vdevs
|
|
|
|
* will be properly bypassed instead.
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
*
|
|
|
|
* Leaf DTL_PARTIAL can be empty when a legitimate write comes from
|
|
|
|
* a dRAID spare vdev. For example, when a dRAID spare is first
|
|
|
|
* used, its spare blocks need to be written to but the leaf vdev's
|
|
|
|
* of such blocks can have empty DTL_PARTIAL.
|
|
|
|
*
|
|
|
|
* There seemed no clean way to allow such writes while bypassing
|
|
|
|
* spurious ones. At this point, just avoid all bypassing for dRAID
|
|
|
|
* for correctness.
|
2009-01-16 00:59:39 +03:00
|
|
|
*/
|
|
|
|
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
|
|
|
|
zio->io_txg != 0 && /* not a delegated i/o */
|
OpenZFS 9290 - device removal reduces redundancy of mirrors
Mirrors are supposed to provide redundancy in the face of whole-disk
failure and silent damage (e.g. some data on disk is not right, but ZFS
hasn't detected the whole device as being broken). However, the current
device removal implementation bypasses some of the mirror's redundancy.
Note that in no case is incorrect data returned, but we might get a
checksum error when we should have been able to find the right data.
There are two underlying problems:
1. When we remove a mirror device, we only read one side of the mirror.
Since we can't verify the checksum, this side may be silently bad, but
the good data is on the other side of the mirror (which we didn't read).
This can cause the removal to "bake in" the busted data – all copies of
the data in the new location are the same, busted version, while we left
the good version behind.
The fix for this is to read and copy both sides of the mirror. If the
old and new vdevs are mirrors, we will read both sides of the old
mirror, and write each copy to the corresponding side of the new mirror.
(If the old and new vdevs have a different number of children, we will
do this as best as possible.) Even though we aren't verifying checksums,
this ensures that as long as there's a good copy of the data, we'll have
a good copy after the removal, even if there's silent damage to one side
of the mirror. If we're removing a mirror that has some silent damage,
we'll have exactly the same damage in the new location (assuming that
the new location is also a mirror).
2. When we read from an indirect vdev that points to a mirror vdev, we
only consider one copy of the data. This can lead to reduced effective
redundancy, because we might read a bad copy of the data from one side
of the mirror, and not retry the other, good side of the mirror.
Note that the problem is not with the removal process, but rather after
the removal has completed (having copied correct data to both sides of
the mirror), if one side of the new mirror is silently damaged, we
encounter the problem when reading the relocated data via the indirect
vdev. Also note that the problem doesn't occur when ZFS knows that one
side of the mirror is bad, e.g. when a disk entirely fails or is
offlined.
The impact is that reads (from indirect vdevs that point to mirrors) may
return a checksum error even though the good data exists on one side of
the mirror, and scrub doesn't repair all data on the mirror (if some of
it is pointed to via an indirect vdev).
The fix for this is complicated by "split blocks" - one logical block
may be split into two (or more) pieces with each piece moved to a
different new location. In this case we need to read all versions of
each split (one from each side of the mirror), and figure out which
combination of versions results in the correct checksum, and then repair
the incorrect versions.
This ensures that we supply the same redundancy whether you use device
removal or not. For example, if a mirror has small silent errors on all
of its children, we can still reconstruct the correct data, as long as
those errors are at sufficiently-separated offsets (specifically,
separated by the largest block size - default of 128KB, but up to 16MB).
Porting notes:
* A new indirect vdev check was moved from dsl_scan_needs_resilver_cb()
to dsl_scan_needs_resilver(), which was added to ZoL as part of the
sequential scrub work.
* Passed NULL for zfs_ereport_post_checksum()'s zbookmark_phys_t
parameter. The extra parameter is unique to ZoL.
* When posting indirect checksum errors the ABD can be passed directly,
zfs_ereport_post_checksum() is not yet ABD-aware in OpenZFS.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9290
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/591
Closes #6900
2018-02-13 22:37:56 +03:00
|
|
|
vd->vdev_ops != &vdev_indirect_ops &&
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
vd->vdev_top->vdev_ops != &vdev_draid_ops &&
|
2009-01-16 00:59:39 +03:00
|
|
|
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
zio_vdev_io_bypass(zio);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2009-01-16 00:59:39 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
/*
|
|
|
|
* Select the next best leaf I/O to process. Distributed spares are
|
|
|
|
* excluded since they dispatch the I/O directly to a leaf vdev after
|
|
|
|
* applying the dRAID mapping.
|
|
|
|
*/
|
|
|
|
if (vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
vd->vdev_ops != &vdev_draid_spare_ops &&
|
|
|
|
(zio->io_type == ZIO_TYPE_READ ||
|
|
|
|
zio->io_type == ZIO_TYPE_WRITE ||
|
|
|
|
zio->io_type == ZIO_TYPE_TRIM)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if ((zio = vdev_queue_io(zio)) == NULL)
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (!vdev_accessible(vd, zio)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_interrupt(zio);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2017-08-02 19:08:38 +03:00
|
|
|
zio->io_delay = gethrtime();
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2014-10-21 02:07:45 +04:00
|
|
|
vd->vdev_ops->vdev_op_io_start(zio);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_vdev_io_done(zio_t *zio)
|
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
vdev_t *vd = zio->io_vd;
|
|
|
|
vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
|
|
|
|
boolean_t unexpected_error = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_READ ||
|
|
|
|
zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2016-02-29 21:05:23 +03:00
|
|
|
if (zio->io_delay)
|
|
|
|
zio->io_delay = gethrtime() - zio->io_delay;
|
|
|
|
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
vd->vdev_ops != &vdev_draid_spare_ops) {
|
2008-12-03 23:09:06 +03:00
|
|
|
vdev_queue_io_done(zio);
|
|
|
|
|
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE)
|
|
|
|
vdev_cache_write(zio);
|
|
|
|
|
|
|
|
if (zio_injection_enabled && zio->io_error == 0)
|
2017-08-15 01:17:15 +03:00
|
|
|
zio->io_error = zio_handle_device_injections(vd, zio,
|
|
|
|
EIO, EILSEQ);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (zio_injection_enabled && zio->io_error == 0)
|
|
|
|
zio->io_error = zio_handle_label_injection(zio, EIO);
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
|
2008-12-03 23:09:06 +03:00
|
|
|
if (!vdev_accessible(vd, zio)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
unexpected_error = B_TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ops->vdev_op_io_done(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-07-18 21:43:55 +03:00
|
|
|
if (unexpected_error)
|
2009-02-18 23:51:31 +03:00
|
|
|
VERIFY(vdev_probe(vd, zio) == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-12-21 20:13:06 +03:00
|
|
|
/*
|
|
|
|
* This function is used to change the priority of an existing zio that is
|
|
|
|
* currently in-flight. This is used by the arc to upgrade priority in the
|
|
|
|
* event that a demand read is made for a block that is currently queued
|
|
|
|
* as a scrub or async read IO. Otherwise, the high priority read request
|
|
|
|
* would end up having to wait for the lower priority IO.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zio_change_priority(zio_t *pio, zio_priority_t priority)
|
|
|
|
{
|
|
|
|
zio_t *cio, *cio_next;
|
|
|
|
zio_link_t *zl = NULL;
|
|
|
|
|
|
|
|
ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
|
|
|
|
|
|
|
|
if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
|
|
|
|
vdev_queue_change_io_priority(pio, priority);
|
|
|
|
} else {
|
|
|
|
pio->io_priority = priority;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
|
|
|
|
cio_next = zio_walk_children(pio, &zl);
|
|
|
|
zio_change_priority(cio, priority);
|
|
|
|
}
|
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* For non-raidz ZIOs, we can just copy aside the bad data read from the
|
|
|
|
* disk, and use that to finish the checksum ereport later.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
|
2017-01-05 22:10:07 +03:00
|
|
|
const abd_t *good_buf)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
|
|
|
/* no processing needed */
|
|
|
|
zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
void
|
Clean up RAIDZ/DRAID ereport code
The RAIDZ and DRAID code is responsible for reporting checksum errors on
their child vdevs. Checksum errors represent events where a disk
returned data or parity that should have been correct, but was not. In
other words, these are instances of silent data corruption. The
checksum errors show up in the vdev stats (and thus `zpool status`'s
CKSUM column), and in the event log (`zpool events`).
Note, this is in contrast with the more common "noisy" errors where a
disk goes offline, in which case ZFS knows that the disk is bad and
doesn't try to read it, or the device returns an error on the requested
read or write operation.
RAIDZ/DRAID generate checksum errors via three code paths:
1. When RAIDZ/DRAID reconstructs a damaged block, checksum errors are
reported on any children whose data was not used during the
reconstruction. This is handled in `raidz_reconstruct()`. This is the
most common type of RAIDZ/DRAID checksum error.
2. When RAIDZ/DRAID is not able to reconstruct a damaged block, that
means that the data has been lost. The zio fails and an error is
returned to the consumer (e.g. the read(2) system call). This would
happen if, for example, three different disks in a RAIDZ2 group are
silently damaged. Since the damage is silent, it isn't possible to know
which three disks are damaged, so a checksum error is reported against
every child that returned data or parity for this read. (For DRAID,
typically only one "group" of children is involved in each io.) This
case is handled in `vdev_raidz_cksum_finish()`. This is the next most
common type of RAIDZ/DRAID checksum error.
3. If RAIDZ/DRAID is not able to reconstruct a damaged block (like in
case 2), but there happens to be additional copies of this block due to
"ditto blocks" (i.e. multiple DVA's in this blkptr_t), and one of those
copies is good, then RAIDZ/DRAID compares each sector of the data or
parity that it retrieved with the good data from the other DVA, and if
they differ then it reports a checksum error on this child. This
differs from case 2 in that the checksum error is reported on only the
subset of children that actually have bad data or parity. This case
happens very rarely, since normally only metadata has ditto blocks. If
the silent damage is extensive, there will be many instances of case 2,
and the pool will likely be unrecoverable.
The code for handling case 3 is considerably more complicated than the
other cases, for two reasons:
1. It needs to run after the main raidz read logic has completed. The
data RAIDZ read needs to be preserved until after the alternate DVA has
been read, which necessitates refcounts and callbacks managed by the
non-raidz-specific zio layer.
2. It's nontrivial to map the sections of data read by RAIDZ to the
correct data. For example, the correct data does not include the parity
information, so the parity must be recalculated based on the correct
data, and then compared to the parity that was read from the RAIDZ
children.
Due to the complexity of case 3, the rareness of hitting it, and the
minimal benefit it provides above case 2, this commit removes the code
for case 3. These types of errors will now be handled the same as case
2, i.e. the checksum error will be reported against all children that
returned data or parity.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11735
2021-03-20 02:22:10 +03:00
|
|
|
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
2017-01-05 22:10:07 +03:00
|
|
|
void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-01-05 22:10:07 +03:00
|
|
|
abd_copy(abd, zio->io_abd, zio->io_size);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
zcr->zcr_cbinfo = zio->io_size;
|
2017-01-05 22:10:07 +03:00
|
|
|
zcr->zcr_cbdata = abd;
|
2010-05-29 00:45:14 +04:00
|
|
|
zcr->zcr_finish = zio_vsd_default_cksum_finish;
|
2017-01-05 22:10:07 +03:00
|
|
|
zcr->zcr_free = zio_abd_free;
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_vdev_io_assess(zio_t *zio)
|
|
|
|
{
|
|
|
|
vdev_t *vd = zio->io_vd;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
|
|
|
|
spa_config_exit(zio->io_spa, SCL_ZIO, zio);
|
|
|
|
|
|
|
|
if (zio->io_vsd != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_vsd_ops->vsd_free(zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_vsd = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio_injection_enabled && zio->io_error == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_error = zio_handle_fault_injection(zio, EIO);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the I/O failed, determine whether we should attempt to retry it.
|
2010-05-29 00:45:14 +04:00
|
|
|
*
|
|
|
|
* On retry, we cut in line in the issue queue, since we don't want
|
|
|
|
* compression/checksumming/etc. work to prevent our (cheap) IO reissue.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error && vd == NULL &&
|
|
|
|
!(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_error = 0;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_flags |= ZIO_FLAG_IO_RETRY |
|
|
|
|
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
|
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
|
|
|
|
zio_requeue_io_start_cut_in_line);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* If we got an error on a leaf device, convert it to ENXIO
|
|
|
|
* if the device is not accessible at all.
|
|
|
|
*/
|
|
|
|
if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
!vdev_accessible(vd, zio))
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we can't write to an interior vdev (mirror or RAID-Z),
|
|
|
|
* set vdev_cant_write so that we stop trying to allocate from it.
|
|
|
|
*/
|
|
|
|
if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
|
2013-09-04 16:00:57 +04:00
|
|
|
vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
|
2021-03-26 21:19:35 +03:00
|
|
|
vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting "
|
|
|
|
"cant_write=TRUE due to write failure with ENXIO",
|
|
|
|
zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
vd->vdev_cant_write = B_TRUE;
|
2013-09-04 16:00:57 +04:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2017-02-04 20:23:50 +03:00
|
|
|
/*
|
|
|
|
* If a cache flush returns ENOTSUP or ENOTTY, we know that no future
|
2019-03-29 19:13:20 +03:00
|
|
|
* attempts will ever succeed. In this case we set a persistent
|
|
|
|
* boolean flag so that we don't bother with it in the future.
|
2017-02-04 20:23:50 +03:00
|
|
|
*/
|
|
|
|
if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
|
|
|
|
zio->io_type == ZIO_TYPE_IOCTL &&
|
|
|
|
zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
|
|
|
|
vd->vdev_nowritecache = B_TRUE;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error)
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
zio->io_physdone != NULL) {
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
|
|
|
|
zio->io_physdone(zio->io_logical);
|
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_vdev_io_reissue(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
|
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage >>= 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_vdev_io_redone(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage >>= 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_vdev_io_bypass(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
|
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
|
|
|
|
zio->io_flags |= ZIO_FLAG_IO_BYPASS;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Encrypt and store encryption parameters
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
|
|
|
|
* managing the storage of encryption parameters and passing them to the
|
|
|
|
* lower-level encryption functions.
|
|
|
|
*/
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio_encrypt(zio_t *zio)
|
|
|
|
{
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
uint64_t psize = BP_GET_PSIZE(bp);
|
2017-11-08 22:12:59 +03:00
|
|
|
uint64_t dsobj = zio->io_bookmark.zb_objset;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
dmu_object_type_t ot = BP_GET_TYPE(bp);
|
|
|
|
void *enc_buf = NULL;
|
|
|
|
abd_t *eabd = NULL;
|
|
|
|
uint8_t salt[ZIO_DATA_SALT_LEN];
|
|
|
|
uint8_t iv[ZIO_DATA_IV_LEN];
|
|
|
|
uint8_t mac[ZIO_DATA_MAC_LEN];
|
|
|
|
boolean_t no_crypt = B_FALSE;
|
|
|
|
|
|
|
|
/* the root zio already encrypted the data */
|
|
|
|
if (zio->io_child_type == ZIO_CHILD_GANG)
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
|
|
|
/* only ZIL blocks are re-encrypted on rewrite */
|
|
|
|
if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
|
|
|
if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
|
|
|
|
BP_SET_CRYPT(bp, B_FALSE);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* if we are doing raw encryption set the provided encryption params */
|
|
|
|
if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
|
2017-11-08 22:12:59 +03:00
|
|
|
ASSERT0(BP_GET_LEVEL(bp));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
|
|
|
BP_SET_BYTEORDER(bp, zp->zp_byteorder);
|
|
|
|
if (ot != DMU_OT_OBJSET)
|
|
|
|
zio_crypt_encode_mac_bp(bp, zp->zp_mac);
|
2017-11-08 22:12:59 +03:00
|
|
|
|
|
|
|
/* dnode blocks must be written out in the provided byteorder */
|
|
|
|
if (zp->zp_byteorder != ZFS_HOST_BYTEORDER &&
|
|
|
|
ot == DMU_OT_DNODE) {
|
|
|
|
void *bswap_buf = zio_buf_alloc(psize);
|
|
|
|
abd_t *babd = abd_get_from_buf(bswap_buf, psize);
|
|
|
|
|
|
|
|
ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
|
|
|
|
abd_copy_to_buf(bswap_buf, zio->io_abd, psize);
|
|
|
|
dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf,
|
|
|
|
psize);
|
|
|
|
|
|
|
|
abd_take_ownership_of_buf(babd, B_TRUE);
|
|
|
|
zio_push_transform(zio, babd, psize, psize, NULL);
|
|
|
|
}
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (DMU_OT_IS_ENCRYPTED(ot))
|
|
|
|
zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* indirect blocks only maintain a cksum of the lower level MACs */
|
|
|
|
if (BP_GET_LEVEL(bp) > 0) {
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
|
|
|
VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
|
|
|
|
zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
|
|
|
|
mac));
|
|
|
|
zio_crypt_encode_mac_bp(bp, mac);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Objset blocks are a special case since they have 2 256-bit MACs
|
|
|
|
* embedded within them.
|
|
|
|
*/
|
|
|
|
if (ot == DMU_OT_OBJSET) {
|
|
|
|
ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
|
|
|
|
ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
2017-11-08 22:12:59 +03:00
|
|
|
VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj,
|
|
|
|
zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp)));
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* unencrypted object types are only authenticated with a MAC */
|
|
|
|
if (!DMU_OT_IS_ENCRYPTED(ot)) {
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
2017-11-08 22:12:59 +03:00
|
|
|
VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj,
|
|
|
|
zio->io_abd, psize, mac));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio_crypt_encode_mac_bp(bp, mac);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Later passes of sync-to-convergence may decide to rewrite data
|
|
|
|
* in place to avoid more disk reallocations. This presents a problem
|
2019-01-13 21:11:52 +03:00
|
|
|
* for encryption because this constitutes rewriting the new data with
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
* the same encryption key and IV. However, this only applies to blocks
|
|
|
|
* in the MOS (particularly the spacemaps) and we do not encrypt the
|
|
|
|
* MOS. We assert that the zio is allocating or an intent log write
|
|
|
|
* to enforce this.
|
|
|
|
*/
|
|
|
|
ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
|
|
|
|
ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
|
|
|
|
ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
|
|
|
|
ASSERT3U(psize, !=, 0);
|
|
|
|
|
|
|
|
enc_buf = zio_buf_alloc(psize);
|
|
|
|
eabd = abd_get_from_buf(enc_buf, psize);
|
|
|
|
abd_take_ownership_of_buf(eabd, B_TRUE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For an explanation of what encryption parameters are stored
|
|
|
|
* where, see the block comment in zio_crypt.c.
|
|
|
|
*/
|
|
|
|
if (ot == DMU_OT_INTENT_LOG) {
|
|
|
|
zio_crypt_decode_params_bp(bp, salt, iv);
|
|
|
|
} else {
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Perform the encryption. This should not fail */
|
2018-05-03 01:36:20 +03:00
|
|
|
VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark,
|
|
|
|
BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
|
|
|
|
salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
|
|
|
/* encode encryption metadata into the bp */
|
|
|
|
if (ot == DMU_OT_INTENT_LOG) {
|
|
|
|
/*
|
|
|
|
* ZIL blocks store the MAC in the embedded checksum, so the
|
|
|
|
* transform must always be applied.
|
|
|
|
*/
|
|
|
|
zio_crypt_encode_mac_zil(enc_buf, mac);
|
|
|
|
zio_push_transform(zio, eabd, psize, psize, NULL);
|
|
|
|
} else {
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
|
|
|
zio_crypt_encode_params_bp(bp, salt, iv);
|
|
|
|
zio_crypt_encode_mac_bp(bp, mac);
|
|
|
|
|
|
|
|
if (no_crypt) {
|
|
|
|
ASSERT3U(ot, ==, DMU_OT_DNODE);
|
|
|
|
abd_free(eabd);
|
|
|
|
} else {
|
|
|
|
zio_push_transform(zio, eabd, psize, psize, NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Generate and verify checksums
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_checksum_generate(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-12-03 23:09:06 +03:00
|
|
|
enum zio_checksum checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (bp == NULL) {
|
|
|
|
/*
|
|
|
|
* This is zio_write_phys().
|
|
|
|
* We're either generating a label checksum, or none at all.
|
|
|
|
*/
|
|
|
|
checksum = zio->io_prop.zp_checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (checksum == ZIO_CHECKSUM_OFF)
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
ASSERT(checksum == ZIO_CHECKSUM_LABEL);
|
|
|
|
} else {
|
|
|
|
if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
|
|
|
|
ASSERT(!IO_IS_ALLOCATING(zio));
|
|
|
|
checksum = ZIO_CHECKSUM_GANG_HEADER;
|
|
|
|
} else {
|
|
|
|
checksum = BP_GET_CHECKSUM(bp);
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_checksum_verify(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_bad_cksum_t info;
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
int error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_vd != NULL);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (bp == NULL) {
|
|
|
|
/*
|
|
|
|
* This is zio_read_phys().
|
|
|
|
* We're either verifying a label checksum, or nothing at all.
|
|
|
|
*/
|
|
|
|
if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((error = zio_checksum_error(zio, &info)) != 0) {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_error = error;
|
2014-11-23 06:24:52 +03:00
|
|
|
if (error == ECKSUM &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
|
2021-02-20 09:33:15 +03:00
|
|
|
(void) zfs_ereport_start_checksum(zio->io_spa,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio->io_vd, &zio->io_bookmark, zio,
|
Clean up RAIDZ/DRAID ereport code
The RAIDZ and DRAID code is responsible for reporting checksum errors on
their child vdevs. Checksum errors represent events where a disk
returned data or parity that should have been correct, but was not. In
other words, these are instances of silent data corruption. The
checksum errors show up in the vdev stats (and thus `zpool status`'s
CKSUM column), and in the event log (`zpool events`).
Note, this is in contrast with the more common "noisy" errors where a
disk goes offline, in which case ZFS knows that the disk is bad and
doesn't try to read it, or the device returns an error on the requested
read or write operation.
RAIDZ/DRAID generate checksum errors via three code paths:
1. When RAIDZ/DRAID reconstructs a damaged block, checksum errors are
reported on any children whose data was not used during the
reconstruction. This is handled in `raidz_reconstruct()`. This is the
most common type of RAIDZ/DRAID checksum error.
2. When RAIDZ/DRAID is not able to reconstruct a damaged block, that
means that the data has been lost. The zio fails and an error is
returned to the consumer (e.g. the read(2) system call). This would
happen if, for example, three different disks in a RAIDZ2 group are
silently damaged. Since the damage is silent, it isn't possible to know
which three disks are damaged, so a checksum error is reported against
every child that returned data or parity for this read. (For DRAID,
typically only one "group" of children is involved in each io.) This
case is handled in `vdev_raidz_cksum_finish()`. This is the next most
common type of RAIDZ/DRAID checksum error.
3. If RAIDZ/DRAID is not able to reconstruct a damaged block (like in
case 2), but there happens to be additional copies of this block due to
"ditto blocks" (i.e. multiple DVA's in this blkptr_t), and one of those
copies is good, then RAIDZ/DRAID compares each sector of the data or
parity that it retrieved with the good data from the other DVA, and if
they differ then it reports a checksum error on this child. This
differs from case 2 in that the checksum error is reported on only the
subset of children that actually have bad data or parity. This case
happens very rarely, since normally only metadata has ditto blocks. If
the silent damage is extensive, there will be many instances of case 2,
and the pool will likely be unrecoverable.
The code for handling case 3 is considerably more complicated than the
other cases, for two reasons:
1. It needs to run after the main raidz read logic has completed. The
data RAIDZ read needs to be preserved until after the alternate DVA has
been read, which necessitates refcounts and callbacks managed by the
non-raidz-specific zio layer.
2. It's nontrivial to map the sections of data read by RAIDZ to the
correct data. For example, the correct data does not include the parity
information, so the parity must be recalculated based on the correct
data, and then compared to the parity that was read from the RAIDZ
children.
Due to the complexity of case 3, the rareness of hitting it, and the
minimal benefit it provides above case 2, this commit removes the code
for case 3. These types of errors will now be handled the same as case
2, i.e. the checksum error will be reported against all children that
returned data or parity.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11735
2021-03-20 02:22:10 +03:00
|
|
|
zio->io_offset, zio->io_size, &info);
|
2021-02-20 09:33:15 +03:00
|
|
|
mutex_enter(&zio->io_vd->vdev_stat_lock);
|
|
|
|
zio->io_vd->vdev_stat.vs_checksum_errors++;
|
|
|
|
mutex_exit(&zio->io_vd->vdev_stat_lock);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called by RAID-Z to ensure we don't compute the checksum twice.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zio_checksum_verified(zio_t *zio)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
* Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
|
2014-06-06 01:19:08 +04:00
|
|
|
* An error of 0 indicates success. ENXIO indicates whole-device failure,
|
2019-01-13 21:11:52 +03:00
|
|
|
* which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
|
2008-12-03 23:09:06 +03:00
|
|
|
* indicate errors that are specific to one I/O, and most likely permanent.
|
|
|
|
* Any other error is presumed to be worse because we weren't expecting it.
|
|
|
|
* ==========================================================================
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
int
|
|
|
|
zio_worst_error(int e1, int e2)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
|
|
|
|
int r1, r2;
|
|
|
|
|
|
|
|
for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
|
|
|
|
if (e1 == zio_error_rank[r1])
|
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
|
|
|
|
if (e2 == zio_error_rank[r2])
|
|
|
|
break;
|
|
|
|
|
|
|
|
return (r1 > r2 ? e1 : e2);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* I/O completion
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_ready(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *pio, *pio_next;
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_link_t *zl = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
|
|
|
|
ZIO_WAIT_READY)) {
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_ready) {
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
2013-05-10 23:47:54 +04:00
|
|
|
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
|
|
|
|
(zio->io_flags & ZIO_FLAG_NOPWRITE));
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_ready(zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (bp != NULL && bp != &zio->io_bp_copy)
|
|
|
|
zio->io_bp_copy = *bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
if (zio->io_error != 0) {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
|
|
|
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
2018-09-06 04:33:36 +03:00
|
|
|
ASSERT(zio->io_metaslab_class != NULL);
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* We were unable to allocate anything, unreserve and
|
|
|
|
* issue the next I/O to allocate.
|
|
|
|
*/
|
|
|
|
metaslab_class_throttle_unreserve(
|
2018-09-06 04:33:36 +03:00
|
|
|
zio->io_metaslab_class, zio->io_prop.zp_copies,
|
|
|
|
zio->io_allocator, zio);
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_state[ZIO_WAIT_READY] = 1;
|
2016-10-14 03:59:18 +03:00
|
|
|
pio = zio_walk_parents(zio, &zl);
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* As we notify zio's parents, new parents could be added.
|
|
|
|
* New parents go to the head of zio's io_parent_list, however,
|
|
|
|
* so we will (correctly) not notify them. The remainder of zio's
|
|
|
|
* io_parent_list, from 'pio_next' onward, cannot change because
|
|
|
|
* all parents must wait for us to be done before they can be done.
|
|
|
|
*/
|
|
|
|
for (; pio != NULL; pio = pio_next) {
|
2016-10-14 03:59:18 +03:00
|
|
|
pio_next = zio_walk_parents(zio, &zl);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
|
2009-02-18 23:51:31 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zio->io_flags & ZIO_FLAG_NODATA) {
|
|
|
|
if (BP_IS_GANG(bp)) {
|
|
|
|
zio->io_flags &= ~ZIO_FLAG_NODATA;
|
|
|
|
} else {
|
2016-07-22 18:52:49 +03:00
|
|
|
ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zio_injection_enabled &&
|
|
|
|
zio->io_spa->spa_syncing_txg == zio->io_txg)
|
|
|
|
zio_handle_ignored_writes(zio);
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* Update the allocation throttle accounting.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zio_dva_throttle_done(zio_t *zio)
|
|
|
|
{
|
2019-12-05 23:37:00 +03:00
|
|
|
zio_t *lio __maybe_unused = zio->io_logical;
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_t *pio = zio_unique_parent(zio);
|
|
|
|
vdev_t *vd = zio->io_vd;
|
|
|
|
int flags = METASLAB_ASYNC_ALLOC;
|
|
|
|
|
|
|
|
ASSERT3P(zio->io_bp, !=, NULL);
|
|
|
|
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
|
|
|
|
ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
|
|
|
|
ASSERT(vd != NULL);
|
|
|
|
ASSERT3P(vd, ==, vd->vdev_top);
|
2017-08-11 01:53:40 +03:00
|
|
|
ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
|
|
|
|
ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
|
|
|
|
ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parents of gang children can have two flavors -- ones that
|
|
|
|
* allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
|
|
|
|
* and ones that allocated the constituent blocks. The allocation
|
|
|
|
* throttle needs to know the allocating parent zio so we must find
|
|
|
|
* it here.
|
|
|
|
*/
|
|
|
|
if (pio->io_child_type == ZIO_CHILD_GANG) {
|
|
|
|
/*
|
|
|
|
* If our parent is a rewrite gang child then our grandparent
|
|
|
|
* would have been the one that performed the allocation.
|
|
|
|
*/
|
|
|
|
if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
|
|
|
|
pio = zio_unique_parent(pio);
|
|
|
|
flags |= METASLAB_GANG_CHILD;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(IO_IS_ALLOCATING(pio));
|
|
|
|
ASSERT3P(zio, !=, zio->io_logical);
|
|
|
|
ASSERT(zio->io_logical != NULL);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
|
|
|
|
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
|
2018-09-06 04:33:36 +03:00
|
|
|
ASSERT(zio->io_metaslab_class != NULL);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
|
|
|
|
pio->io_allocator, B_TRUE);
|
2016-10-14 03:59:18 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
|
2018-09-06 04:33:36 +03:00
|
|
|
metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
|
|
|
|
pio->io_allocator, pio);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Call into the pipeline to see if there is more work that
|
|
|
|
* needs to be done. If there is work to be done it will be
|
|
|
|
* dispatched to another taskq thread.
|
|
|
|
*/
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
static zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_done(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* Always attempt to keep stack usage minimal here since
|
2019-01-13 21:11:52 +03:00
|
|
|
* we can be called recursively up to 19 levels deep.
|
2016-10-14 03:59:18 +03:00
|
|
|
*/
|
2017-01-05 22:10:07 +03:00
|
|
|
const uint64_t psize = zio->io_size;
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *pio, *pio_next;
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_link_t *zl = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
2009-07-03 02:44:48 +04:00
|
|
|
* If our children haven't all completed,
|
2008-12-03 23:09:06 +03:00
|
|
|
* wait for them and then repeat this pipeline stage.
|
|
|
|
*/
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* If the allocation throttle is enabled, then update the accounting.
|
|
|
|
* We only track child I/Os that are part of an allocating async
|
|
|
|
* write. We must do this since the allocation is performed
|
|
|
|
* by the logical I/O but the actual write is done by child I/Os.
|
|
|
|
*/
|
|
|
|
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
|
|
|
|
zio->io_child_type == ZIO_CHILD_VDEV) {
|
2018-09-06 04:33:36 +03:00
|
|
|
ASSERT(zio->io_metaslab_class != NULL);
|
|
|
|
ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_dva_throttle_done(zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the allocation throttle is enabled, verify that
|
|
|
|
* we have decremented the refcounts for every I/O that was throttled.
|
|
|
|
*/
|
|
|
|
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
ASSERT(zio->io_bp != NULL);
|
2018-09-06 04:33:36 +03:00
|
|
|
|
OpenZFS 9112 - Improve allocation performance on high-end systems
Overview
========
We parallelize the allocation process by creating the concept of
"allocators". There are a certain number of allocators per metaslab
group, defined by the value of a tunable at pool open time. Each
allocator for a given metaslab group has up to 2 active metaslabs; one
"primary", and one "secondary". The primary and secondary weight mean
the same thing they did in in the pre-allocator world; primary metaslabs
are used for most allocations, secondary metaslabs are used for ditto
blocks being allocated in the same metaslab group. There is also the
CLAIM weight, which has been separated out from the other weights, but
that is less important to understanding the patch. The active metaslabs
for each allocator are moved from their normal place in the metaslab
tree for the group to the back of the tree. This way, they will not be
selected for use by other allocators searching for new metaslabs unless
all the passive metaslabs are unsuitable for allocations. If that does
happen, the allocators will "steal" from each other to ensure that IOs
don't fail until there is truly no space left to perform allocations.
In addition, the alloc queue for each metaslab group has been broken
into a separate queue for each allocator. We don't want to dramatically
increase the number of inflight IOs on low-end systems, because it can
significantly increase txg times. On the other hand, we want to ensure
that there are enough IOs for each allocator to allow for good
coalescing before sending the IOs to the disk. As a result, we take a
compromise path; each allocator's alloc queue max depth starts at a
certain value for every txg. Every time an IO completes, we increase the
max depth. This should hopefully provide a good balance between the two
failure modes, while not dramatically increasing complexity.
We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause
very similar contention when selecting IOs to allocate. This
parallelization uses the same allocator scheme as metaslab selection.
Performance Results
===================
Performance improvements from this change can vary significantly based
on the number of CPUs in the system, whether or not the system has a
NUMA architecture, the speed of the drives, the values for the various
tunables, and the workload being performed. For an fio async sequential
write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB
SSDs, there is a roughly 25% performance improvement.
Future Work
===========
Analysis of the performance of the system with this patch applied shows
that a significant new bottleneck is the vdev disk queues, which also
need to be parallelized. Prototyping of this change has occurred, and
there was a performance improvement, but more work needs to be done
before its stability has been verified and it is ready to be upstreamed.
Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Paul Dagnelie <pcd@delphix.com>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Porting Notes:
* Fix reservation test failures by increasing tolerance.
OpenZFS-issue: https://illumos.org/issues/9112
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3
Closes #7682
2018-02-12 23:56:06 +03:00
|
|
|
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
|
|
|
|
zio->io_allocator);
|
2020-12-15 21:55:44 +03:00
|
|
|
VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class->
|
|
|
|
mc_allocator[zio->io_allocator].mca_alloc_slots, zio));
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
|
|
|
|
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_children[c][w] == 0);
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(zio->io_bp->blk_pad[0] == 0);
|
|
|
|
ASSERT(zio->io_bp->blk_pad[1] == 0);
|
2013-11-01 23:26:11 +04:00
|
|
|
ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
|
|
|
|
sizeof (blkptr_t)) == 0 ||
|
2010-08-26 22:04:17 +04:00
|
|
|
(zio->io_bp == zio_unique_parent(zio)->io_bp));
|
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_bp_override == NULL &&
|
2008-12-03 23:09:06 +03:00
|
|
|
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
|
2013-11-01 23:26:11 +04:00
|
|
|
ASSERT3U(zio->io_prop.zp_copies, <=,
|
|
|
|
BP_GET_NDVAS(zio->io_bp));
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
|
2013-11-01 23:26:11 +04:00
|
|
|
(BP_COUNT_GANG(zio->io_bp) ==
|
|
|
|
BP_GET_NDVAS(zio->io_bp)));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2013-05-10 23:47:54 +04:00
|
|
|
if (zio->io_flags & ZIO_FLAG_NOPWRITE)
|
|
|
|
VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* If there were child vdev/gang/ddt errors, they apply to us now.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
|
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the I/O on the transformed data was successful, generate any
|
|
|
|
* checksum reports now while we still have the transformed data.
|
|
|
|
*/
|
|
|
|
if (zio->io_error == 0) {
|
|
|
|
while (zio->io_cksum_report != NULL) {
|
|
|
|
zio_cksum_report_t *zcr = zio->io_cksum_report;
|
|
|
|
uint64_t align = zcr->zcr_align;
|
2016-07-22 18:52:49 +03:00
|
|
|
uint64_t asize = P2ROUNDUP(psize, align);
|
|
|
|
abd_t *adata = zio->io_abd;
|
|
|
|
|
2021-04-16 21:00:53 +03:00
|
|
|
if (adata != NULL && asize != psize) {
|
2017-01-05 22:10:07 +03:00
|
|
|
adata = abd_alloc(asize, B_TRUE);
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_copy(adata, zio->io_abd, psize);
|
|
|
|
abd_zero_off(adata, psize, asize - psize);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
zio->io_cksum_report = zcr->zcr_next;
|
|
|
|
zcr->zcr_next = NULL;
|
2017-01-05 22:10:07 +03:00
|
|
|
zcr->zcr_finish(zcr, adata);
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_ereport_free_checksum(zcr);
|
|
|
|
|
2021-04-16 21:00:53 +03:00
|
|
|
if (adata != NULL && asize != psize)
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_free(adata);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
zio_pop_transforms(zio); /* note: may set zio->io_error */
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
vdev_stat_update(zio, psize);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-10-02 03:54:52 +04:00
|
|
|
/*
|
2013-04-30 02:49:23 +04:00
|
|
|
* If this I/O is attached to a particular vdev is slow, exceeding
|
2012-12-21 06:15:34 +04:00
|
|
|
* 30 seconds to complete, post an error described the I/O delay.
|
|
|
|
* We ignore these errors if the device is currently unavailable.
|
2010-10-02 03:54:52 +04:00
|
|
|
*/
|
2018-11-09 03:47:24 +03:00
|
|
|
if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
|
|
|
|
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
|
|
|
|
/*
|
|
|
|
* We want to only increment our slow IO counters if
|
|
|
|
* the IO is valid (i.e. not if the drive is removed).
|
|
|
|
*
|
|
|
|
* zfs_ereport_post() will also do these checks, but
|
|
|
|
* it can also ratelimit and have other failures, so we
|
|
|
|
* need to increment the slow_io counters independent
|
|
|
|
* of it.
|
|
|
|
*/
|
|
|
|
if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
|
|
|
|
zio->io_spa, zio->io_vd, zio)) {
|
|
|
|
mutex_enter(&zio->io_vd->vdev_stat_lock);
|
|
|
|
zio->io_vd->vdev_stat.vs_slow_ios++;
|
|
|
|
mutex_exit(&zio->io_vd->vdev_stat_lock);
|
|
|
|
|
2020-09-01 05:35:11 +03:00
|
|
|
(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
|
2018-11-09 03:47:24 +03:00
|
|
|
zio->io_spa, zio->io_vd, &zio->io_bookmark,
|
2020-09-04 20:34:28 +03:00
|
|
|
zio, 0);
|
2018-11-09 03:47:24 +03:00
|
|
|
}
|
|
|
|
}
|
2012-12-21 06:15:34 +04:00
|
|
|
}
|
2010-10-02 03:54:52 +04:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error) {
|
|
|
|
/*
|
|
|
|
* If this I/O is attached to a particular vdev,
|
|
|
|
* generate an error message describing the I/O failure
|
|
|
|
* at the block level. We ignore these errors if the
|
|
|
|
* device is currently unavailable.
|
|
|
|
*/
|
2010-08-26 22:04:17 +04:00
|
|
|
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
|
2019-03-15 04:21:53 +03:00
|
|
|
!vdev_is_dead(zio->io_vd)) {
|
2020-09-04 20:34:28 +03:00
|
|
|
int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
|
|
|
|
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
|
|
|
|
if (ret != EALREADY) {
|
|
|
|
mutex_enter(&zio->io_vd->vdev_stat_lock);
|
|
|
|
if (zio->io_type == ZIO_TYPE_READ)
|
|
|
|
zio->io_vd->vdev_stat.vs_read_errors++;
|
|
|
|
else if (zio->io_type == ZIO_TYPE_WRITE)
|
|
|
|
zio->io_vd->vdev_stat.vs_write_errors++;
|
|
|
|
mutex_exit(&zio->io_vd->vdev_stat_lock);
|
2019-03-15 04:21:53 +03:00
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zio->io_error == EIO || !(zio->io_flags &
|
|
|
|
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
|
2010-08-26 22:04:17 +04:00
|
|
|
zio == zio->io_logical) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* For logical I/O requests, tell the SPA to log the
|
|
|
|
* error and generate a logical data ereport.
|
|
|
|
*/
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
spa_log_error(zio->io_spa, &zio->io_bookmark);
|
2020-09-01 05:35:11 +03:00
|
|
|
(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
|
2020-09-04 20:34:28 +03:00
|
|
|
zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:04:17 +04:00
|
|
|
if (zio->io_error && zio == zio->io_logical) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Determine whether zio should be reexecuted. This will
|
|
|
|
* propagate all the way to the root via zio_notify_parent().
|
|
|
|
*/
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (IO_IS_ALLOCATING(zio) &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error != ENOSPC)
|
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
|
|
|
|
else
|
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if ((zio->io_type == ZIO_TYPE_READ ||
|
|
|
|
zio->io_type == ZIO_TYPE_FREE) &&
|
2010-08-27 01:24:34 +04:00
|
|
|
!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_error == ENXIO &&
|
2010-08-26 22:04:17 +04:00
|
|
|
spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
|
|
|
|
spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
|
|
|
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
|
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Here is a possibly good place to attempt to do
|
|
|
|
* either combinatorial reconstruction or error correction
|
|
|
|
* based on checksums. It also might be a good place
|
|
|
|
* to send out preliminary ereports before we suspend
|
|
|
|
* processing.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* If there were logical child errors, they apply to us now.
|
|
|
|
* We defer this until now to avoid conflating logical child
|
|
|
|
* errors with errors that happened to the zio itself when
|
|
|
|
* updating vdev stats and reporting FMA events above.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zio->io_error || zio->io_reexecute) &&
|
|
|
|
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
|
2013-05-10 23:47:54 +04:00
|
|
|
!(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
|
2010-08-26 22:04:17 +04:00
|
|
|
zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
zio_gang_tree_free(&zio->io_gang_tree);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Godfather I/Os should never suspend.
|
|
|
|
*/
|
|
|
|
if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
|
|
|
|
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
|
2017-01-28 23:13:34 +03:00
|
|
|
zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_reexecute) {
|
|
|
|
/*
|
|
|
|
* This is a logical I/O that wants to reexecute.
|
|
|
|
*
|
|
|
|
* Reexecute is top-down. When an i/o fails, if it's not
|
|
|
|
* the root, it simply notifies its parent and sticks around.
|
|
|
|
* The parent, seeing that it still has children in zio_done(),
|
|
|
|
* does the same. This percolates all the way up to the root.
|
|
|
|
* The root i/o will reexecute or suspend the entire tree.
|
|
|
|
*
|
|
|
|
* This approach ensures that zio_reexecute() honors
|
|
|
|
* all the original i/o dependency relationships, e.g.
|
|
|
|
* parents not executing until children are ready.
|
|
|
|
*/
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zio->io_gang_leader = NULL;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_state[ZIO_WAIT_DONE] = 1;
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
|
|
|
* "The Godfather" I/O monitors its children but is
|
|
|
|
* not a true parent to them. It will track them through
|
|
|
|
* the pipeline but severs its ties whenever they get into
|
|
|
|
* trouble (e.g. suspended). This allows "The Godfather"
|
|
|
|
* I/O to return status without blocking.
|
|
|
|
*/
|
2016-10-14 03:59:18 +03:00
|
|
|
zl = NULL;
|
|
|
|
for (pio = zio_walk_parents(zio, &zl); pio != NULL;
|
|
|
|
pio = pio_next) {
|
|
|
|
zio_link_t *remove_zl = zl;
|
|
|
|
pio_next = zio_walk_parents(zio, &zl);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
|
|
|
|
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_remove_child(pio, zio, remove_zl);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
/*
|
|
|
|
* This is a rare code path, so we don't
|
|
|
|
* bother with "next_to_execute".
|
|
|
|
*/
|
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
|
|
|
|
NULL);
|
2009-07-03 02:44:48 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
if ((pio = zio_unique_parent(zio)) != NULL) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* We're not a root i/o, so there's nothing to do
|
|
|
|
* but notify our parent. Don't propagate errors
|
|
|
|
* upward since we haven't permanently failed yet.
|
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
/*
|
|
|
|
* This is a rare code path, so we don't bother with
|
|
|
|
* "next_to_execute".
|
|
|
|
*/
|
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
|
|
|
|
/*
|
|
|
|
* We'd fail again if we reexecuted now, so suspend
|
|
|
|
* until conditions improve (e.g. device comes online).
|
|
|
|
*/
|
2018-03-15 20:56:55 +03:00
|
|
|
zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Reexecution is potentially a huge amount of work.
|
|
|
|
* Hand it off to the otherwise-unused claim taskq.
|
|
|
|
*/
|
2011-11-08 04:26:52 +04:00
|
|
|
ASSERT(taskq_empty_ent(&zio->io_tqent));
|
2013-05-06 23:24:30 +04:00
|
|
|
spa_taskq_dispatch_ent(zio->io_spa,
|
|
|
|
ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
|
2011-11-08 04:26:52 +04:00
|
|
|
(task_func_t *)zio_reexecute, zio, 0,
|
|
|
|
&zio->io_tqent);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_count == 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_reexecute == 0);
|
|
|
|
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Report any checksum errors, since the I/O is complete.
|
|
|
|
*/
|
|
|
|
while (zio->io_cksum_report != NULL) {
|
|
|
|
zio_cksum_report_t *zcr = zio->io_cksum_report;
|
|
|
|
zio->io_cksum_report = zcr->zcr_next;
|
|
|
|
zcr->zcr_next = NULL;
|
|
|
|
zcr->zcr_finish(zcr, NULL);
|
|
|
|
zfs_ereport_free_checksum(zcr);
|
|
|
|
}
|
|
|
|
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
|
2014-06-06 01:19:08 +04:00
|
|
|
!BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
/*
|
|
|
|
* It is the responsibility of the done callback to ensure that this
|
|
|
|
* particular zio is no longer discoverable for adoption, and as
|
|
|
|
* such, cannot acquire any new parents.
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_done)
|
|
|
|
zio->io_done(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_state[ZIO_WAIT_DONE] = 1;
|
|
|
|
mutex_exit(&zio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
/*
|
|
|
|
* We are done executing this zio. We may want to execute a parent
|
|
|
|
* next. See the comment in zio_notify_parent().
|
|
|
|
*/
|
|
|
|
zio_t *next_to_execute = NULL;
|
2016-10-14 03:59:18 +03:00
|
|
|
zl = NULL;
|
|
|
|
for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
|
|
|
|
zio_link_t *remove_zl = zl;
|
|
|
|
pio_next = zio_walk_parents(zio, &zl);
|
|
|
|
zio_remove_child(pio, zio, remove_zl);
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_waiter != NULL) {
|
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_executor = NULL;
|
|
|
|
cv_broadcast(&zio->io_cv);
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
} else {
|
|
|
|
zio_destroy(zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Reduce taskq and context-switch cost of zio pipe
When doing a read from disk, ZFS creates 3 ZIO's: a zio_null(), the
logical zio_read(), and then a physical zio. Currently, each of these
results in a separate taskq_dispatch(zio_execute).
On high-read-iops workloads, this causes a significant performance
impact. By processing all 3 ZIO's in a single taskq entry, we reduce the
overhead on taskq locking and context switching. We accomplish this by
allowing zio_done() to return a "next zio to execute" to zio_execute().
This results in a ~12% performance increase for random reads, from
96,000 iops to 108,000 iops (with recordsize=8k, on SSD's).
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-59292
Closes #7736
2018-08-03 01:51:45 +03:00
|
|
|
return (next_to_execute);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
* I/O pipeline definition
|
|
|
|
* ==========================================================================
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static zio_pipe_stage_t *zio_pipeline[] = {
|
2008-12-03 23:09:06 +03:00
|
|
|
NULL,
|
|
|
|
zio_read_bp_init,
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_write_bp_init,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free_bp_init,
|
|
|
|
zio_issue_async,
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_write_compress,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio_encrypt,
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_checksum_generate,
|
2013-05-10 23:47:54 +04:00
|
|
|
zio_nop_write,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_read_start,
|
|
|
|
zio_ddt_read_done,
|
|
|
|
zio_ddt_write,
|
|
|
|
zio_ddt_free,
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_assemble,
|
|
|
|
zio_gang_issue,
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_dva_throttle,
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_dva_allocate,
|
|
|
|
zio_dva_free,
|
|
|
|
zio_dva_claim,
|
|
|
|
zio_ready,
|
|
|
|
zio_vdev_io_start,
|
|
|
|
zio_vdev_io_done,
|
|
|
|
zio_vdev_io_assess,
|
|
|
|
zio_checksum_verify,
|
|
|
|
zio_done
|
|
|
|
};
|
2010-08-26 22:49:16 +04:00
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
/*
|
|
|
|
* Compare two zbookmark_phys_t's to see which we would reach first in a
|
|
|
|
* pre-order traversal of the object tree.
|
|
|
|
*
|
|
|
|
* This is simple in every case aside from the meta-dnode object. For all other
|
|
|
|
* objects, we traverse them in order (object 1 before object 2, and so on).
|
|
|
|
* However, all of these objects are traversed while traversing object 0, since
|
|
|
|
* the data it points to is the list of objects. Thus, we need to convert to a
|
|
|
|
* canonical representation so we can compare meta-dnode bookmarks to
|
|
|
|
* non-meta-dnode bookmarks.
|
|
|
|
*
|
|
|
|
* We do this by calculating "equivalents" for each field of the zbookmark.
|
|
|
|
* zbookmarks outside of the meta-dnode use their own object and level, and
|
|
|
|
* calculate the level 0 equivalent (the first L0 blkid that is contained in the
|
|
|
|
* blocks this bookmark refers to) by multiplying their blkid by their span
|
|
|
|
* (the number of L0 blocks contained within one block at their level).
|
|
|
|
* zbookmarks inside the meta-dnode calculate their object equivalent
|
|
|
|
* (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
|
|
|
|
* level + 1<<31 (any value larger than a level could ever be) for their level.
|
|
|
|
* This causes them to always compare before a bookmark in their object
|
|
|
|
* equivalent, compare appropriately to bookmarks in other objects, and to
|
|
|
|
* compare appropriately to other bookmarks in the meta-dnode.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
|
|
|
|
const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* These variables represent the "equivalent" values for the zbookmark,
|
|
|
|
* after converting zbookmarks inside the meta dnode to their
|
|
|
|
* normal-object equivalents.
|
|
|
|
*/
|
|
|
|
uint64_t zb1obj, zb2obj;
|
|
|
|
uint64_t zb1L0, zb2L0;
|
|
|
|
uint64_t zb1level, zb2level;
|
|
|
|
|
|
|
|
if (zb1->zb_object == zb2->zb_object &&
|
|
|
|
zb1->zb_level == zb2->zb_level &&
|
|
|
|
zb1->zb_blkid == zb2->zb_blkid)
|
|
|
|
return (0);
|
2012-12-14 03:24:15 +04:00
|
|
|
|
Implement Redacted Send/Receive
Redacted send/receive allows users to send subsets of their data to
a target system. One possible use case for this feature is to not
transmit sensitive information to a data warehousing, test/dev, or
analytics environment. Another is to save space by not replicating
unimportant data within a given dataset, for example in backup tools
like zrepl.
Redacted send/receive is a three-stage process. First, a clone (or
clones) is made of the snapshot to be sent to the target. In this
clone (or clones), all unnecessary or unwanted data is removed or
modified. This clone is then snapshotted to create the "redaction
snapshot" (or snapshots). Second, the new zfs redact command is used
to create a redaction bookmark. The redaction bookmark stores the
list of blocks in a snapshot that were modified by the redaction
snapshot(s). Finally, the redaction bookmark is passed as a parameter
to zfs send. When sending to the snapshot that was redacted, the
redaction bookmark is used to filter out blocks that contain sensitive
or unwanted information, and those blocks are not included in the send
stream. When sending from the redaction bookmark, the blocks it
contains are considered as candidate blocks in addition to those
blocks in the destination snapshot that were modified since the
creation_txg of the redaction bookmark. This step is necessary to
allow the target to rehydrate data in the case where some blocks are
accidentally or unnecessarily modified in the redaction snapshot.
The changes to bookmarks to enable fast space estimation involve
adding deadlists to bookmarks. There is also logic to manage the
life cycles of these deadlists.
The new size estimation process operates in cases where previously
an accurate estimate could not be provided. In those cases, a send
is performed where no data blocks are read, reducing the runtime
significantly and providing a byte-accurate size estimate.
Reviewed-by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Chris Williamson <chris.williamson@delphix.com>
Reviewed-by: Pavel Zhakarov <pavel.zakharov@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #7958
2019-06-19 19:48:13 +03:00
|
|
|
IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT);
|
|
|
|
IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT);
|
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
/*
|
|
|
|
* BP_SPANB calculates the span in blocks.
|
|
|
|
*/
|
|
|
|
zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
|
|
|
|
zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
|
2012-12-14 03:24:15 +04:00
|
|
|
|
|
|
|
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
|
2015-12-22 04:31:57 +03:00
|
|
|
zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
|
|
|
|
zb1L0 = 0;
|
|
|
|
zb1level = zb1->zb_level + COMPARE_META_LEVEL;
|
|
|
|
} else {
|
|
|
|
zb1obj = zb1->zb_object;
|
|
|
|
zb1level = zb1->zb_level;
|
2012-12-14 03:24:15 +04:00
|
|
|
}
|
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
|
|
|
|
zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
|
|
|
|
zb2L0 = 0;
|
|
|
|
zb2level = zb2->zb_level + COMPARE_META_LEVEL;
|
|
|
|
} else {
|
|
|
|
zb2obj = zb2->zb_object;
|
|
|
|
zb2level = zb2->zb_level;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Now that we have a canonical representation, do the comparison. */
|
|
|
|
if (zb1obj != zb2obj)
|
|
|
|
return (zb1obj < zb2obj ? -1 : 1);
|
|
|
|
else if (zb1L0 != zb2L0)
|
|
|
|
return (zb1L0 < zb2L0 ? -1 : 1);
|
|
|
|
else if (zb1level != zb2level)
|
|
|
|
return (zb1level > zb2level ? -1 : 1);
|
|
|
|
/*
|
|
|
|
* This can (theoretically) happen if the bookmarks have the same object
|
|
|
|
* and level, but different blkids, if the block sizes are not the same.
|
|
|
|
* There is presently no way to change the indirect block sizes
|
|
|
|
*/
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function checks the following: given that last_block is the place that
|
|
|
|
* our traversal stopped last time, does that guarantee that we've visited
|
|
|
|
* every node under subtree_root? Therefore, we can't just use the raw output
|
|
|
|
* of zbookmark_compare. We have to pass in a modified version of
|
|
|
|
* subtree_root; by incrementing the block id, and then checking whether
|
|
|
|
* last_block is before or equal to that, we can tell whether or not having
|
|
|
|
* visited last_block implies that all of subtree_root's children have been
|
|
|
|
* visited.
|
|
|
|
*/
|
|
|
|
boolean_t
|
|
|
|
zbookmark_subtree_completed(const dnode_phys_t *dnp,
|
|
|
|
const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
|
|
|
|
{
|
|
|
|
zbookmark_phys_t mod_zb = *subtree_root;
|
|
|
|
mod_zb.zb_blkid++;
|
|
|
|
ASSERT(last_block->zb_level == 0);
|
|
|
|
|
|
|
|
/* The objset_phys_t isn't before anything. */
|
|
|
|
if (dnp == NULL)
|
2012-12-14 03:24:15 +04:00
|
|
|
return (B_FALSE);
|
2015-12-22 04:31:57 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
|
|
|
|
* data block size in sectors, because that variable is only used if
|
|
|
|
* the bookmark refers to a block in the meta-dnode. Since we don't
|
|
|
|
* know without examining it what object it refers to, and there's no
|
|
|
|
* harm in passing in this value in other cases, we always pass it in.
|
|
|
|
*
|
|
|
|
* We pass in 0 for the indirect block size shift because zb2 must be
|
|
|
|
* level 0. The indirect block size is only used to calculate the span
|
|
|
|
* of the bookmark, but since the bookmark must be level 0, the span is
|
|
|
|
* always 1, so the math works out.
|
|
|
|
*
|
|
|
|
* If you make changes to how the zbookmark_compare code works, be sure
|
|
|
|
* to make sure that this code still works afterwards.
|
|
|
|
*/
|
|
|
|
return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
|
|
|
|
1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
|
|
|
|
last_block) <= 0);
|
2012-12-14 03:24:15 +04:00
|
|
|
}
|
|
|
|
|
2010-08-26 22:49:16 +04:00
|
|
|
EXPORT_SYMBOL(zio_type_name);
|
2014-12-16 22:44:24 +03:00
|
|
|
EXPORT_SYMBOL(zio_buf_alloc);
|
|
|
|
EXPORT_SYMBOL(zio_data_buf_alloc);
|
|
|
|
EXPORT_SYMBOL(zio_buf_free);
|
|
|
|
EXPORT_SYMBOL(zio_data_buf_free);
|
2010-08-26 22:49:16 +04:00
|
|
|
|
2019-09-06 00:49:49 +03:00
|
|
|
/* BEGIN CSTYLED */
|
|
|
|
ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW,
|
2018-11-09 03:47:24 +03:00
|
|
|
"Max I/O completion time (milliseconds) before marking it as slow");
|
2011-05-04 02:09:28 +04:00
|
|
|
|
2019-09-06 00:49:49 +03:00
|
|
|
ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW,
|
|
|
|
"Prioritize requeued I/O");
|
2013-06-04 13:25:22 +04:00
|
|
|
|
2019-09-06 00:49:49 +03:00
|
|
|
ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW,
|
2013-11-01 23:26:11 +04:00
|
|
|
"Defer frees starting in this pass");
|
2013-06-04 13:25:22 +04:00
|
|
|
|
2019-09-06 00:49:49 +03:00
|
|
|
ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW,
|
2013-11-01 23:26:11 +04:00
|
|
|
"Don't compress starting in this pass");
|
2013-06-04 13:25:22 +04:00
|
|
|
|
2019-09-06 00:49:49 +03:00
|
|
|
ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW,
|
2013-11-01 23:26:11 +04:00
|
|
|
"Rewrite new bps starting in this pass");
|
2016-10-14 03:59:18 +03:00
|
|
|
|
2019-09-06 00:49:49 +03:00
|
|
|
ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW,
|
2016-10-14 03:59:18 +03:00
|
|
|
"Throttle block allocations in the ZIO pipeline");
|
2019-02-15 23:44:24 +03:00
|
|
|
|
2019-09-06 00:49:49 +03:00
|
|
|
ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW,
|
2019-02-15 23:44:24 +03:00
|
|
|
"Log all slow ZIOs, not just those with vdevs");
|
2019-09-06 00:49:49 +03:00
|
|
|
/* END CSTYLED */
|