2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2017-03-21 04:36:00 +03:00
|
|
|
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
2011-11-08 04:26:52 +04:00
|
|
|
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
2014-11-03 23:15:08 +03:00
|
|
|
#include <sys/sysmacros.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
#include <sys/fm/fs/zfs.h>
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/txg.h>
|
|
|
|
#include <sys/spa_impl.h>
|
|
|
|
#include <sys/vdev_impl.h>
|
|
|
|
#include <sys/zio_impl.h>
|
|
|
|
#include <sys/zio_compress.h>
|
|
|
|
#include <sys/zio_checksum.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/dmu_objset.h>
|
|
|
|
#include <sys/arc.h>
|
|
|
|
#include <sys/ddt.h>
|
2014-06-06 01:19:08 +04:00
|
|
|
#include <sys/blkptr.h>
|
2013-12-09 22:37:51 +04:00
|
|
|
#include <sys/zfeature.h>
|
2017-11-16 04:27:01 +03:00
|
|
|
#include <sys/dsl_scan.h>
|
2016-10-14 03:59:18 +03:00
|
|
|
#include <sys/metaslab_impl.h>
|
2016-02-29 21:05:23 +03:00
|
|
|
#include <sys/time.h>
|
2016-05-23 20:41:29 +03:00
|
|
|
#include <sys/trace_zio.h>
|
2016-07-22 18:52:49 +03:00
|
|
|
#include <sys/abd.h>
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
#include <sys/dsl_crypt.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O type descriptions
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
const char *zio_type_name[ZIO_TYPES] = {
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* Note: Linux kernel thread name length is limited
|
|
|
|
* so these names will differ from upstream open zfs.
|
|
|
|
*/
|
2010-10-28 21:36:50 +04:00
|
|
|
"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
|
2010-05-29 00:45:14 +04:00
|
|
|
};
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-12-08 23:57:42 +03:00
|
|
|
int zio_dva_throttle_enabled = B_TRUE;
|
2016-10-14 03:59:18 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O kmem caches
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
kmem_cache_t *zio_cache;
|
2009-02-18 23:51:31 +03:00
|
|
|
kmem_cache_t *zio_link_cache;
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
|
|
|
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
2016-07-22 18:52:49 +03:00
|
|
|
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
|
|
|
uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
|
|
|
uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
|
|
|
|
#endif
|
|
|
|
|
2010-10-02 03:54:52 +04:00
|
|
|
int zio_delay_max = ZIO_DELAY_MAX;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-10-21 02:07:45 +04:00
|
|
|
#define ZIO_PIPELINE_CONTINUE 0x100
|
|
|
|
#define ZIO_PIPELINE_STOP 0x101
|
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
#define BP_SPANB(indblkshift, level) \
|
|
|
|
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
|
|
|
|
#define COMPARE_META_LEVEL 0x80000000ul
|
2013-05-06 21:14:52 +04:00
|
|
|
/*
|
|
|
|
* The following actions directly effect the spa's sync-to-convergence logic.
|
|
|
|
* The values below define the sync pass when we start performing the action.
|
|
|
|
* Care should be taken when changing these values as they directly impact
|
|
|
|
* spa_sync() performance. Tuning these values may introduce subtle performance
|
|
|
|
* pathologies and should only be done in the context of performance analysis.
|
|
|
|
* These tunables will eventually be removed and replaced with #defines once
|
|
|
|
* enough analysis has been done to determine optimal values.
|
|
|
|
*
|
|
|
|
* The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
|
|
|
|
* regular blocks are not deferred.
|
|
|
|
*/
|
|
|
|
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
|
|
|
|
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
|
|
|
|
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* An allocating zio is one that either currently has the DVA allocate
|
|
|
|
* stage set or will have it later in its lifetime.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
|
|
|
|
|
2011-05-04 02:09:28 +04:00
|
|
|
int zio_requeue_io_start_cut_in_line = 1;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
int zio_buf_debug_limit = 16384;
|
|
|
|
#else
|
|
|
|
int zio_buf_debug_limit = 0;
|
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
static inline void __zio_execute(zio_t *zio);
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
void
|
|
|
|
zio_init(void)
|
|
|
|
{
|
|
|
|
size_t c;
|
|
|
|
vmem_t *data_alloc_arena = NULL;
|
|
|
|
|
2015-01-30 22:25:19 +03:00
|
|
|
zio_cache = kmem_cache_create("zio_cache",
|
|
|
|
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_link_cache = kmem_cache_create("zio_link_cache",
|
2014-05-15 05:17:39 +04:00
|
|
|
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For small buffers, we want a cache for each multiple of
|
2014-11-03 23:15:08 +03:00
|
|
|
* SPA_MINBLOCKSIZE. For larger buffers, we want a cache
|
|
|
|
* for each quarter-power of 2.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
|
|
|
|
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
|
|
|
|
size_t p2 = size;
|
|
|
|
size_t align = 0;
|
2015-02-07 00:37:02 +03:00
|
|
|
size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-11-05 07:54:48 +03:00
|
|
|
#if defined(_ILP32) && defined(_KERNEL)
|
2014-11-03 23:15:08 +03:00
|
|
|
/*
|
|
|
|
* Cache size limited to 1M on 32-bit platforms until ARC
|
|
|
|
* buffers no longer require virtual address space.
|
|
|
|
*/
|
|
|
|
if (size > zfs_max_recordsize)
|
|
|
|
break;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
while (!ISP2(p2))
|
2008-11-20 23:01:55 +03:00
|
|
|
p2 &= p2 - 1;
|
|
|
|
|
2013-05-17 01:18:06 +04:00
|
|
|
#ifndef _KERNEL
|
|
|
|
/*
|
|
|
|
* If we are using watchpoints, put each buffer on its own page,
|
|
|
|
* to eliminate the performance overhead of trapping to the
|
|
|
|
* kernel when modifying a non-watched buffer that shares the
|
|
|
|
* page with a watched buffer.
|
|
|
|
*/
|
|
|
|
if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
|
|
|
|
continue;
|
2016-06-29 23:59:51 +03:00
|
|
|
/*
|
|
|
|
* Here's the problem - on 4K native devices in userland on
|
|
|
|
* Linux using O_DIRECT, buffers must be 4K aligned or I/O
|
|
|
|
* will fail with EINVAL, causing zdb (and others) to coredump.
|
|
|
|
* Since userland probably doesn't need optimized buffer caches,
|
|
|
|
* we just force 4K alignment on everything.
|
|
|
|
*/
|
|
|
|
align = 8 * SPA_MINBLOCKSIZE;
|
|
|
|
#else
|
2017-05-02 20:04:30 +03:00
|
|
|
if (size < PAGESIZE) {
|
2008-11-20 23:01:55 +03:00
|
|
|
align = SPA_MINBLOCKSIZE;
|
2013-05-17 01:18:06 +04:00
|
|
|
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
|
2017-05-02 20:04:30 +03:00
|
|
|
align = PAGESIZE;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2016-06-29 23:59:51 +03:00
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (align != 0) {
|
|
|
|
char name[36];
|
|
|
|
(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
|
|
|
|
zio_buf_cache[c] = kmem_cache_create(name, size,
|
2015-02-07 00:37:02 +03:00
|
|
|
align, NULL, NULL, NULL, NULL, NULL, cflags);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
|
|
|
|
zio_data_buf_cache[c] = kmem_cache_create(name, size,
|
2011-11-02 03:56:48 +04:00
|
|
|
align, NULL, NULL, NULL, NULL,
|
2015-02-07 00:37:02 +03:00
|
|
|
data_alloc_arena, cflags);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
while (--c != 0) {
|
|
|
|
ASSERT(zio_buf_cache[c] != NULL);
|
|
|
|
if (zio_buf_cache[c - 1] == NULL)
|
|
|
|
zio_buf_cache[c - 1] = zio_buf_cache[c];
|
|
|
|
|
|
|
|
ASSERT(zio_data_buf_cache[c] != NULL);
|
|
|
|
if (zio_data_buf_cache[c - 1] == NULL)
|
|
|
|
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_inject_init();
|
2013-01-23 13:54:30 +04:00
|
|
|
|
|
|
|
lz4_init();
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_fini(void)
|
|
|
|
{
|
|
|
|
size_t c;
|
|
|
|
kmem_cache_t *last_cache = NULL;
|
|
|
|
kmem_cache_t *last_data_cache = NULL;
|
|
|
|
|
|
|
|
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
|
2014-11-03 23:15:08 +03:00
|
|
|
#ifdef _ILP32
|
|
|
|
/*
|
|
|
|
* Cache size limited to 1M on 32-bit platforms until ARC
|
|
|
|
* buffers no longer require virtual address space.
|
|
|
|
*/
|
|
|
|
if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
|
|
|
|
break;
|
2016-07-22 18:52:49 +03:00
|
|
|
#endif
|
|
|
|
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
|
|
|
if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c])
|
|
|
|
(void) printf("zio_fini: [%d] %llu != %llu\n",
|
|
|
|
(int)((c + 1) << SPA_MINBLOCKSHIFT),
|
|
|
|
(long long unsigned)zio_buf_cache_allocs[c],
|
|
|
|
(long long unsigned)zio_buf_cache_frees[c]);
|
2014-11-03 23:15:08 +03:00
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zio_buf_cache[c] != last_cache) {
|
|
|
|
last_cache = zio_buf_cache[c];
|
|
|
|
kmem_cache_destroy(zio_buf_cache[c]);
|
|
|
|
}
|
|
|
|
zio_buf_cache[c] = NULL;
|
|
|
|
|
|
|
|
if (zio_data_buf_cache[c] != last_data_cache) {
|
|
|
|
last_data_cache = zio_data_buf_cache[c];
|
|
|
|
kmem_cache_destroy(zio_data_buf_cache[c]);
|
|
|
|
}
|
|
|
|
zio_data_buf_cache[c] = NULL;
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
kmem_cache_destroy(zio_link_cache);
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_cache_destroy(zio_cache);
|
|
|
|
|
|
|
|
zio_inject_fini();
|
2013-01-23 13:54:30 +04:00
|
|
|
|
|
|
|
lz4_fini();
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Allocate and free I/O buffers
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
|
|
|
|
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
|
|
|
|
* useful to inspect ZFS metadata, but if possible, we should avoid keeping
|
|
|
|
* excess / transient data in-core during a crashdump.
|
|
|
|
*/
|
|
|
|
void *
|
|
|
|
zio_buf_alloc(size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
2016-07-22 18:52:49 +03:00
|
|
|
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
|
|
|
atomic_add_64(&zio_buf_cache_allocs[c], 1);
|
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-12-03 22:56:32 +03:00
|
|
|
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use zio_data_buf_alloc to allocate data. The data will not appear in a
|
|
|
|
* crashdump if the kernel panics. This exists so that we will limit the amount
|
|
|
|
* of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
|
|
|
|
* of kernel heap dumped to disk when the kernel panics)
|
|
|
|
*/
|
|
|
|
void *
|
|
|
|
zio_data_buf_alloc(size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-12-03 22:56:32 +03:00
|
|
|
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_buf_free(void *buf, size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
2016-07-22 18:52:49 +03:00
|
|
|
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
|
|
|
|
atomic_add_64(&zio_buf_cache_frees[c], 1);
|
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
kmem_cache_free(zio_buf_cache[c], buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_data_buf_free(void *buf, size_t size)
|
|
|
|
{
|
|
|
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
kmem_cache_free(zio_data_buf_cache[c], buf);
|
|
|
|
}
|
|
|
|
|
2017-01-05 22:10:07 +03:00
|
|
|
static void
|
|
|
|
zio_abd_free(void *abd, size_t size)
|
|
|
|
{
|
|
|
|
abd_free((abd_t *)abd);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Push and pop I/O transform buffers
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2016-06-02 07:04:53 +03:00
|
|
|
void
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
|
2017-01-12 20:42:11 +03:00
|
|
|
zio_transform_func_t *transform)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2014-11-21 03:09:39 +03:00
|
|
|
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
/*
|
|
|
|
* Ensure that anyone expecting this zio to contain a linear ABD isn't
|
|
|
|
* going to get a nasty surprise when they try to access the data.
|
|
|
|
*/
|
|
|
|
IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
|
|
|
|
|
|
|
|
zt->zt_orig_abd = zio->io_abd;
|
2008-12-03 23:09:06 +03:00
|
|
|
zt->zt_orig_size = zio->io_size;
|
2008-11-20 23:01:55 +03:00
|
|
|
zt->zt_bufsize = bufsize;
|
2008-12-03 23:09:06 +03:00
|
|
|
zt->zt_transform = transform;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zt->zt_next = zio->io_transform_stack;
|
|
|
|
zio->io_transform_stack = zt;
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio->io_abd = data;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_size = size;
|
|
|
|
}
|
|
|
|
|
2016-06-02 07:04:53 +03:00
|
|
|
void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_pop_transforms(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_transform_t *zt;
|
|
|
|
|
|
|
|
while ((zt = zio->io_transform_stack) != NULL) {
|
|
|
|
if (zt->zt_transform != NULL)
|
|
|
|
zt->zt_transform(zio,
|
2016-07-22 18:52:49 +03:00
|
|
|
zt->zt_orig_abd, zt->zt_orig_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zt->zt_bufsize != 0)
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_free(zio->io_abd);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio->io_abd = zt->zt_orig_abd;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_size = zt->zt_orig_size;
|
|
|
|
zio->io_transform_stack = zt->zt_next;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
kmem_free(zt, sizeof (zio_transform_t));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
* I/O transform callbacks for subblocks, decompression, and decryption
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static void
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
ASSERT(zio->io_size > size);
|
|
|
|
|
|
|
|
if (zio->io_type == ZIO_TYPE_READ)
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_copy(data, zio->io_abd, size);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2016-07-22 18:52:49 +03:00
|
|
|
if (zio->io_error == 0) {
|
|
|
|
void *tmp = abd_borrow_buf(data, size);
|
|
|
|
int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
|
|
|
|
zio->io_abd, tmp, zio->io_size, size);
|
|
|
|
abd_return_buf_copy(data, tmp, size);
|
|
|
|
|
|
|
|
if (ret != 0)
|
|
|
|
zio->io_error = SET_ERROR(EIO);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
static void
|
|
|
|
zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
void *tmp;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
2017-11-08 22:12:59 +03:00
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
uint64_t dsobj = zio->io_bookmark.zb_objset;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
uint64_t lsize = BP_GET_LSIZE(bp);
|
|
|
|
dmu_object_type_t ot = BP_GET_TYPE(bp);
|
|
|
|
uint8_t salt[ZIO_DATA_SALT_LEN];
|
|
|
|
uint8_t iv[ZIO_DATA_IV_LEN];
|
|
|
|
uint8_t mac[ZIO_DATA_MAC_LEN];
|
|
|
|
boolean_t no_crypt = B_FALSE;
|
|
|
|
|
|
|
|
ASSERT(BP_USES_CRYPT(bp));
|
|
|
|
ASSERT3U(size, !=, 0);
|
|
|
|
|
|
|
|
if (zio->io_error != 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify the cksum of MACs stored in an indirect bp. It will always
|
|
|
|
* be possible to verify this since it does not require an encryption
|
|
|
|
* key.
|
|
|
|
*/
|
|
|
|
if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
|
|
|
|
zio_crypt_decode_mac_bp(bp, mac);
|
|
|
|
|
|
|
|
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
|
|
|
|
/*
|
|
|
|
* We haven't decompressed the data yet, but
|
|
|
|
* zio_crypt_do_indirect_mac_checksum() requires
|
|
|
|
* decompressed data to be able to parse out the MACs
|
|
|
|
* from the indirect block. We decompress it now and
|
|
|
|
* throw away the result after we are finished.
|
|
|
|
*/
|
|
|
|
tmp = zio_buf_alloc(lsize);
|
|
|
|
ret = zio_decompress_data(BP_GET_COMPRESS(bp),
|
|
|
|
zio->io_abd, tmp, zio->io_size, lsize);
|
|
|
|
if (ret != 0) {
|
|
|
|
ret = SET_ERROR(EIO);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
|
|
|
|
tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
|
|
|
|
zio_buf_free(tmp, lsize);
|
|
|
|
} else {
|
|
|
|
ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
|
|
|
|
zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
|
|
|
|
}
|
|
|
|
abd_copy(data, zio->io_abd, size);
|
|
|
|
|
|
|
|
if (ret != 0)
|
|
|
|
goto error;
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is an authenticated block, just check the MAC. It would be
|
|
|
|
* nice to separate this out into its own flag, but for the moment
|
|
|
|
* enum zio_flag is out of bits.
|
|
|
|
*/
|
|
|
|
if (BP_IS_AUTHENTICATED(bp)) {
|
|
|
|
if (ot == DMU_OT_OBJSET) {
|
2017-11-08 22:12:59 +03:00
|
|
|
ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
|
|
|
|
dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
} else {
|
|
|
|
zio_crypt_decode_mac_bp(bp, mac);
|
2017-11-08 22:12:59 +03:00
|
|
|
ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
|
|
|
|
zio->io_abd, size, mac);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
abd_copy(data, zio->io_abd, size);
|
|
|
|
|
|
|
|
if (ret != 0)
|
|
|
|
goto error;
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_crypt_decode_params_bp(bp, salt, iv);
|
|
|
|
|
|
|
|
if (ot == DMU_OT_INTENT_LOG) {
|
|
|
|
tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
|
|
|
|
zio_crypt_decode_mac_zil(tmp, mac);
|
|
|
|
abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
|
|
|
|
} else {
|
|
|
|
zio_crypt_decode_mac_bp(bp, mac);
|
|
|
|
}
|
|
|
|
|
2017-11-08 22:12:59 +03:00
|
|
|
ret = spa_do_crypt_abd(B_FALSE, spa, dsobj, bp, bp->blk_birth,
|
|
|
|
size, data, zio->io_abd, iv, mac, salt, &no_crypt);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (no_crypt)
|
|
|
|
abd_copy(data, zio->io_abd, size);
|
|
|
|
|
|
|
|
if (ret != 0)
|
|
|
|
goto error;
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
error:
|
|
|
|
/* assert that the key was found unless this was speculative */
|
|
|
|
ASSERT(ret != ENOENT || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there was a decryption / authentication error return EIO as
|
|
|
|
* the io_error. If this was not a speculative zio, create an ereport.
|
|
|
|
*/
|
|
|
|
if (ret == ECKSUM) {
|
2018-03-31 21:12:51 +03:00
|
|
|
zio->io_error = SET_ERROR(EIO);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
|
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
|
2017-11-08 22:12:59 +03:00
|
|
|
spa, NULL, &zio->io_bookmark, zio, 0, 0);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
zio->io_error = ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* I/O parent/child relationships and pipeline interlocks
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_walk_parents(zio_t *cio, zio_link_t **zl)
|
2009-02-18 23:51:31 +03:00
|
|
|
{
|
|
|
|
list_t *pl = &cio->io_parent_list;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
|
|
|
|
if (*zl == NULL)
|
2009-02-18 23:51:31 +03:00
|
|
|
return (NULL);
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT((*zl)->zl_child == cio);
|
|
|
|
return ((*zl)->zl_parent);
|
2009-02-18 23:51:31 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_walk_children(zio_t *pio, zio_link_t **zl)
|
2009-02-18 23:51:31 +03:00
|
|
|
{
|
|
|
|
list_t *cl = &pio->io_child_list;
|
|
|
|
|
2017-12-21 20:13:06 +03:00
|
|
|
ASSERT(MUTEX_HELD(&pio->io_lock));
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
|
|
|
|
if (*zl == NULL)
|
2009-02-18 23:51:31 +03:00
|
|
|
return (NULL);
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT((*zl)->zl_parent == pio);
|
|
|
|
return ((*zl)->zl_child);
|
2009-02-18 23:51:31 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_unique_parent(zio_t *cio)
|
|
|
|
{
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_link_t *zl = NULL;
|
|
|
|
zio_t *pio = zio_walk_parents(cio, &zl);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
|
2009-02-18 23:51:31 +03:00
|
|
|
return (pio);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_add_child(zio_t *pio, zio_t *cio)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2014-11-21 03:09:39 +03:00
|
|
|
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Logical I/Os can have logical, gang, or vdev children.
|
|
|
|
* Gang I/Os can have gang or vdev children.
|
|
|
|
* Vdev I/Os can only have vdev children.
|
|
|
|
* The following ASSERT captures all of these constraints.
|
|
|
|
*/
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
zl->zl_parent = pio;
|
|
|
|
zl->zl_child = cio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_enter(&cio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
|
2009-02-18 23:51:31 +03:00
|
|
|
pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
|
|
|
|
|
|
|
|
list_insert_head(&pio->io_child_list, zl);
|
|
|
|
list_insert_head(&cio->io_parent_list, zl);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
pio->io_child_count++;
|
|
|
|
cio->io_parent_count++;
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_exit(&cio->io_lock);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
ASSERT(zl->zl_parent == pio);
|
|
|
|
ASSERT(zl->zl_child == cio);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_enter(&cio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
list_remove(&pio->io_child_list, zl);
|
|
|
|
list_remove(&cio->io_parent_list, zl);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
pio->io_child_count--;
|
|
|
|
cio->io_parent_count--;
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_exit(&cio->io_lock);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
kmem_cache_free(zio_link_cache, zl);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static boolean_t
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
boolean_t waiting = B_FALSE;
|
|
|
|
|
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
ASSERT(zio->io_stall == NULL);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
|
|
|
|
if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
uint64_t *countp = &zio->io_children[c][wait];
|
|
|
|
if (*countp != 0) {
|
|
|
|
zio->io_stage >>= 1;
|
|
|
|
ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
|
|
|
|
zio->io_stall = countp;
|
|
|
|
waiting = B_TRUE;
|
|
|
|
break;
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
return (waiting);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 21:58:00 +04:00
|
|
|
__attribute__((always_inline))
|
|
|
|
static inline void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
|
|
|
|
{
|
|
|
|
uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
|
|
|
|
int *errorp = &pio->io_child_error[zio->io_child_type];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
|
|
|
|
*errorp = zio_worst_error(*errorp, zio->io_error);
|
|
|
|
pio->io_reexecute |= zio->io_reexecute;
|
|
|
|
ASSERT3U(*countp, >, 0);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
|
|
|
|
(*countp)--;
|
|
|
|
|
|
|
|
if (*countp == 0 && pio->io_stall == countp) {
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_taskq_type_t type =
|
|
|
|
pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
|
|
|
|
ZIO_TASKQ_INTERRUPT;
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_stall = NULL;
|
|
|
|
mutex_exit(&pio->io_lock);
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* Dispatch the parent zio in its own taskq so that
|
|
|
|
* the child can continue to make progress. This also
|
|
|
|
* prevents overflowing the stack when we have deeply nested
|
|
|
|
* parent-child relationships.
|
|
|
|
*/
|
|
|
|
zio_taskq_dispatch(pio, type, B_FALSE);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
mutex_exit(&pio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
|
|
|
|
{
|
|
|
|
if (zio->io_child_error[c] != 0 && zio->io_error == 0)
|
|
|
|
zio->io_error = zio->io_child_error[c];
|
|
|
|
}
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
int
|
2017-03-21 04:36:00 +03:00
|
|
|
zio_bookmark_compare(const void *x1, const void *x2)
|
2016-10-14 03:59:18 +03:00
|
|
|
{
|
|
|
|
const zio_t *z1 = x1;
|
|
|
|
const zio_t *z2 = x2;
|
|
|
|
|
2017-03-21 04:36:00 +03:00
|
|
|
if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
|
|
|
|
return (-1);
|
|
|
|
if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
|
|
|
|
return (1);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
2017-03-21 04:36:00 +03:00
|
|
|
if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
|
|
|
|
return (-1);
|
|
|
|
if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
|
|
|
|
return (1);
|
2016-10-14 03:59:18 +03:00
|
|
|
|
2017-03-21 04:36:00 +03:00
|
|
|
if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
|
|
|
|
return (-1);
|
|
|
|
if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
|
|
|
|
return (1);
|
|
|
|
|
|
|
|
if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
|
|
|
|
return (-1);
|
|
|
|
if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
|
|
|
|
return (1);
|
|
|
|
|
|
|
|
if (z1 < z2)
|
|
|
|
return (-1);
|
|
|
|
if (z1 > z2)
|
|
|
|
return (1);
|
|
|
|
|
|
|
|
return (0);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* Create the various types of I/O (read, write, free, etc)
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
|
2016-07-11 20:45:52 +03:00
|
|
|
void *private, zio_type_t type, zio_priority_t priority,
|
|
|
|
enum zio_flag flags, vdev_t *vd, uint64_t offset,
|
|
|
|
const zbookmark_phys_t *zb, enum zio_stage stage,
|
|
|
|
enum zio_stage pipeline)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE);
|
|
|
|
ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
|
|
|
|
|
|
|
|
ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
|
|
|
|
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
|
|
|
|
ASSERT(vd || stage == ZIO_STAGE_OPEN);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
|
2016-07-11 20:45:52 +03:00
|
|
|
|
2014-11-21 03:09:39 +03:00
|
|
|
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
|
2015-01-30 22:25:19 +03:00
|
|
|
bzero(zio, sizeof (zio_t));
|
|
|
|
|
Identify locks flagged by lockdep
When running a kernel with CONFIG_LOCKDEP=y, lockdep reports possible
recursive locking in some cases and possible circular locking dependency
in others, within the SPL and ZFS modules.
This patch uses a mutex type defined in SPL, MUTEX_NOLOCKDEP, to mark
such mutexes when they are initialized. This mutex type causes
attempts to take or release those locks to be wrapped in lockdep_off()
and lockdep_on() calls to silence the dependency checker and allow the
use of lock_stats to examine contention.
For RW locks, it uses an analogous lock type, RW_NOLOCKDEP.
The goal is that these locks are ultimately changed back to type
MUTEX_DEFAULT or RW_DEFAULT, after the locks are annotated to reflect
their relationship (e.g. z_name_lock below) or any real problem with the
lock dependencies are fixed.
Some of the affected locks are:
tc_open_lock:
=============
This is an array of locks, all with same name, which txg_quiesce must
take all of in order to move txg to next state. All default to the same
lockdep class, and so to lockdep appears recursive.
zp->z_name_lock:
================
In zfs_rmdir,
dzp = znode for the directory (input to zfs_dirent_lock)
zp = znode for the entry being removed (output of zfs_dirent_lock)
zfs_rmdir()->zfs_dirent_lock() takes z_name_lock in dzp
zfs_rmdir() takes z_name_lock in zp
Since both dzp and zp are type znode_t, the locks have the same default
class, and lockdep considers it a possible recursive lock attempt.
l->l_rwlock:
============
zap_expand_leaf() sometimes creates two new zap leaf structures, via
these call paths:
zap_deref_leaf()->zap_get_leaf_byblk()->zap_leaf_open()
zap_expand_leaf()->zap_create_leaf()->zap_expand_leaf()->zap_create_leaf()
Because both zap_leaf_open() and zap_create_leaf() initialize
l->l_rwlock in their (separate) leaf structures, the lockdep class is
the same, and the linux kernel believes these might both be the same
lock, and emits a possible recursive lock warning.
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3895
2015-10-15 23:08:27 +03:00
|
|
|
mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
|
2015-01-30 22:25:19 +03:00
|
|
|
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
|
|
|
|
|
|
|
|
list_create(&zio->io_parent_list, sizeof (zio_link_t),
|
|
|
|
offsetof(zio_link_t, zl_parent_node));
|
|
|
|
list_create(&zio->io_child_list, sizeof (zio_link_t),
|
|
|
|
offsetof(zio_link_t, zl_child_node));
|
2017-01-12 22:52:56 +03:00
|
|
|
metaslab_trace_init(&zio->io_alloc_list);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd != NULL)
|
|
|
|
zio->io_child_type = ZIO_CHILD_VDEV;
|
|
|
|
else if (flags & ZIO_FLAG_GANG_CHILD)
|
|
|
|
zio->io_child_type = ZIO_CHILD_GANG;
|
2010-05-29 00:45:14 +04:00
|
|
|
else if (flags & ZIO_FLAG_DDT_CHILD)
|
|
|
|
zio->io_child_type = ZIO_CHILD_DDT;
|
2008-12-03 23:09:06 +03:00
|
|
|
else
|
|
|
|
zio->io_child_type = ZIO_CHILD_LOGICAL;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (bp != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_bp = (blkptr_t *)bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_bp_copy = *bp;
|
|
|
|
zio->io_bp_orig = *bp;
|
2010-05-29 00:45:14 +04:00
|
|
|
if (type != ZIO_TYPE_WRITE ||
|
|
|
|
zio->io_child_type == ZIO_CHILD_DDT)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_logical = zio;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
|
|
|
|
pipeline |= ZIO_GANG_STAGES;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
zio->io_spa = spa;
|
|
|
|
zio->io_txg = txg;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_done = done;
|
|
|
|
zio->io_private = private;
|
|
|
|
zio->io_type = type;
|
|
|
|
zio->io_priority = priority;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_vd = vd;
|
|
|
|
zio->io_offset = offset;
|
2016-07-22 18:52:49 +03:00
|
|
|
zio->io_orig_abd = zio->io_abd = data;
|
2016-07-11 20:45:52 +03:00
|
|
|
zio->io_orig_size = zio->io_size = psize;
|
|
|
|
zio->io_lsize = lsize;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_orig_flags = zio->io_flags = flags;
|
|
|
|
zio->io_orig_stage = zio->io_stage = stage;
|
|
|
|
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
|
2016-10-14 03:59:18 +03:00
|
|
|
zio->io_pipeline_trace = ZIO_STAGE_OPEN;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
|
|
|
|
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zb != NULL)
|
|
|
|
zio->io_bookmark = *zb;
|
|
|
|
|
|
|
|
if (pio != NULL) {
|
|
|
|
if (zio->io_logical == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_logical = pio->io_logical;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_child_type == ZIO_CHILD_GANG)
|
|
|
|
zio->io_gang_leader = pio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_add_child(pio, zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-11-08 04:26:52 +04:00
|
|
|
taskq_init_ent(&zio->io_tqent);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_destroy(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2017-01-12 22:52:56 +03:00
|
|
|
metaslab_trace_fini(&zio->io_alloc_list);
|
2015-01-30 22:25:19 +03:00
|
|
|
list_destroy(&zio->io_parent_list);
|
|
|
|
list_destroy(&zio->io_child_list);
|
|
|
|
mutex_destroy(&zio->io_lock);
|
|
|
|
cv_destroy(&zio->io_cv);
|
2008-12-03 23:09:06 +03:00
|
|
|
kmem_cache_free(zio_cache, zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
|
2010-05-29 00:45:14 +04:00
|
|
|
void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
|
2009-02-18 23:51:31 +03:00
|
|
|
ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
return (zio_null(NULL, spa, NULL, done, private, flags));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
void
|
|
|
|
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
|
|
|
|
{
|
|
|
|
if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
|
|
|
|
zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
|
|
|
|
bp, (longlong_t)BP_GET_TYPE(bp));
|
|
|
|
}
|
|
|
|
if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
|
|
|
|
BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
|
|
|
|
zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
|
|
|
|
bp, (longlong_t)BP_GET_CHECKSUM(bp));
|
|
|
|
}
|
|
|
|
if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
|
|
|
|
BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
|
|
|
|
zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
|
|
|
|
bp, (longlong_t)BP_GET_COMPRESS(bp));
|
|
|
|
}
|
|
|
|
if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
|
|
|
|
zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
|
|
|
|
bp, (longlong_t)BP_GET_LSIZE(bp));
|
|
|
|
}
|
|
|
|
if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
|
|
|
|
zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
|
|
|
|
bp, (longlong_t)BP_GET_PSIZE(bp));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (BP_IS_EMBEDDED(bp)) {
|
|
|
|
if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
|
|
|
|
zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
|
|
|
|
bp, (longlong_t)BPE_GET_ETYPE(bp));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pool-specific checks.
|
|
|
|
*
|
|
|
|
* Note: it would be nice to verify that the blk_birth and
|
|
|
|
* BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
|
|
|
|
* allows the birth time of log blocks (and dmu_sync()-ed blocks
|
|
|
|
* that are in the log) to be arbitrarily large.
|
|
|
|
*/
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
|
2014-11-26 20:57:30 +03:00
|
|
|
uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
|
2017-11-04 23:25:13 +03:00
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
if (vdevid >= spa->spa_root_vdev->vdev_children) {
|
|
|
|
zfs_panic_recover("blkptr at %p DVA %u has invalid "
|
|
|
|
"VDEV %llu",
|
|
|
|
bp, i, (longlong_t)vdevid);
|
2016-01-09 20:29:05 +03:00
|
|
|
continue;
|
2014-11-26 20:57:30 +03:00
|
|
|
}
|
2017-11-04 23:25:13 +03:00
|
|
|
vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
|
2014-11-26 20:57:30 +03:00
|
|
|
if (vd == NULL) {
|
|
|
|
zfs_panic_recover("blkptr at %p DVA %u has invalid "
|
|
|
|
"VDEV %llu",
|
|
|
|
bp, i, (longlong_t)vdevid);
|
2016-01-09 20:29:05 +03:00
|
|
|
continue;
|
2014-11-26 20:57:30 +03:00
|
|
|
}
|
|
|
|
if (vd->vdev_ops == &vdev_hole_ops) {
|
|
|
|
zfs_panic_recover("blkptr at %p DVA %u has hole "
|
|
|
|
"VDEV %llu",
|
|
|
|
bp, i, (longlong_t)vdevid);
|
2016-01-09 20:29:05 +03:00
|
|
|
continue;
|
2014-11-26 20:57:30 +03:00
|
|
|
}
|
|
|
|
if (vd->vdev_ops == &vdev_missing_ops) {
|
|
|
|
/*
|
|
|
|
* "missing" vdevs are valid during import, but we
|
|
|
|
* don't have their detailed info (e.g. asize), so
|
|
|
|
* we can't perform any more checks on them.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
}
|
2017-11-04 23:25:13 +03:00
|
|
|
uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
|
|
|
|
uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
|
2014-11-26 20:57:30 +03:00
|
|
|
if (BP_IS_GANG(bp))
|
|
|
|
asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
|
|
|
|
if (offset + asize > vd->vdev_asize) {
|
|
|
|
zfs_panic_recover("blkptr at %p DVA %u has invalid "
|
|
|
|
"OFFSET %llu",
|
|
|
|
bp, i, (longlong_t)offset);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
|
2014-06-25 22:37:59 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2014-11-26 20:57:30 +03:00
|
|
|
zfs_blkptr_verify(spa, bp);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
|
2016-07-11 20:45:52 +03:00
|
|
|
data, size, size, done, private,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
|
|
|
|
ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
|
2016-05-15 18:02:28 +03:00
|
|
|
zio_done_func_t *ready, zio_done_func_t *children_ready,
|
|
|
|
zio_done_func_t *physdone, zio_done_func_t *done,
|
|
|
|
void *private, zio_priority_t priority, enum zio_flag flags,
|
|
|
|
const zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
|
|
|
|
zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
|
|
|
|
zp->zp_compress >= ZIO_COMPRESS_OFF &&
|
|
|
|
zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
|
2012-12-14 03:24:15 +04:00
|
|
|
DMU_OT_IS_VALID(zp->zp_type) &&
|
2008-12-03 23:09:06 +03:00
|
|
|
zp->zp_level < 32 &&
|
2010-05-29 00:45:14 +04:00
|
|
|
zp->zp_copies > 0 &&
|
2013-05-10 23:47:54 +04:00
|
|
|
zp->zp_copies <= spa_max_replication(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
|
|
|
|
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zio->io_ready = ready;
|
2016-05-15 18:02:28 +03:00
|
|
|
zio->io_children_ready = children_ready;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio->io_physdone = physdone;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_prop = *zp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
/*
|
|
|
|
* Data can be NULL if we are going to call zio_write_override() to
|
|
|
|
* provide the already-allocated BP. But we may need the data to
|
|
|
|
* verify a dedup hit (if requested). In this case, don't try to
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
* dedup (just take the already-allocated BP verbatim). Encrypted
|
|
|
|
* dedup blocks need data as well so we also disable dedup in this
|
|
|
|
* case.
|
2014-06-06 01:19:08 +04:00
|
|
|
*/
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (data == NULL &&
|
|
|
|
(zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
|
2014-06-06 01:19:08 +04:00
|
|
|
zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
uint64_t size, zio_done_func_t *done, void *private,
|
2014-06-25 22:37:59 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
|
2016-10-14 03:59:18 +03:00
|
|
|
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
void
|
2013-05-10 23:47:54 +04:00
|
|
|
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
|
|
|
|
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
|
|
|
* We must reset the io_prop to match the values that existed
|
|
|
|
* when the bp was first written by dmu_sync() keeping in mind
|
|
|
|
* that nopwrite and dedup are mutually exclusive.
|
|
|
|
*/
|
|
|
|
zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
|
|
|
|
zio->io_prop.zp_nopwrite = nopwrite;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_prop.zp_copies = copies;
|
|
|
|
zio->io_bp_override = bp;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
|
|
|
|
{
|
2014-06-06 01:19:08 +04:00
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
zfs_blkptr_verify(spa, bp);
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
/*
|
|
|
|
* The check for EMBEDDED is a performance optimization. We
|
|
|
|
* process the free here (by ignoring it) rather than
|
|
|
|
* putting it on the list and then processing it in zio_free_sync().
|
|
|
|
*/
|
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return;
|
2013-09-04 16:00:57 +04:00
|
|
|
metaslab_check_free(spa, bp);
|
2013-07-03 20:13:38 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Frees that are for the currently-syncing txg, are not going to be
|
|
|
|
* deferred, and which will not need to do a read (i.e. not GANG or
|
|
|
|
* DEDUP), can be processed immediately. Otherwise, put them on the
|
|
|
|
* in-memory list for later processing.
|
|
|
|
*/
|
|
|
|
if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
|
|
|
|
txg != spa->spa_syncing_txg ||
|
|
|
|
spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
|
|
|
|
bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
|
|
|
|
} else {
|
|
|
|
VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|
|
|
enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
2013-07-03 20:13:38 +04:00
|
|
|
enum zio_stage stage = ZIO_FREE_PIPELINE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(!BP_IS_HOLE(bp));
|
|
|
|
ASSERT(spa_syncing_txg(spa) == txg);
|
2013-05-06 21:14:52 +04:00
|
|
|
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
|
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
metaslab_check_free(spa, bp);
|
2013-10-07 15:30:22 +04:00
|
|
|
arc_freed(spa, bp);
|
2017-11-16 04:27:01 +03:00
|
|
|
dsl_scan_freed(spa, bp);
|
2013-09-04 16:00:57 +04:00
|
|
|
|
2013-07-03 20:13:38 +04:00
|
|
|
/*
|
|
|
|
* GANG and DEDUP blocks can induce a read (for the gang block header,
|
|
|
|
* or the DDT), so issue them asynchronously so that this thread is
|
|
|
|
* not tied up.
|
|
|
|
*/
|
|
|
|
if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
|
|
|
|
stage |= ZIO_STAGE_ISSUE_ASYNC;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
|
2016-07-11 20:45:52 +03:00
|
|
|
BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
|
|
|
|
flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
|
2013-07-03 20:13:38 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|
|
|
zio_done_func_t *done, void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
zfs_blkptr_verify(spa, bp);
|
2014-06-06 01:19:08 +04:00
|
|
|
|
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* A claim is an allocation of a specific block. Claims are needed
|
|
|
|
* to support immediate writes in the intent log. The issue is that
|
|
|
|
* immediate writes contain committed data, but in a txg that was
|
|
|
|
* *not* committed. Upon opening the pool after an unclean shutdown,
|
|
|
|
* the intent log claims all blocks that contain immediate write data
|
|
|
|
* so that the SPA knows they're in use.
|
|
|
|
*
|
|
|
|
* All claims *must* be resolved in the first txg -- before the SPA
|
|
|
|
* starts allocating blocks -- so that nothing is allocated twice.
|
2010-05-29 00:45:14 +04:00
|
|
|
* If txg == 0 we just verify that the block is claimable.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(txg == spa_first_txg(spa) || txg == 0);
|
|
|
|
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
|
2016-07-11 20:45:52 +03:00
|
|
|
BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
|
|
|
|
flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT0(zio->io_queued_timestamp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_done_func_t *done, void *private, enum zio_flag flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
int c;
|
|
|
|
|
|
|
|
if (vd->vdev_children == 0) {
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
|
|
|
|
|
|
|
|
zio->io_cmd = cmd;
|
|
|
|
} else {
|
2009-02-18 23:51:31 +03:00
|
|
|
zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
for (c = 0; c < vd->vdev_children; c++)
|
|
|
|
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
done, private, flags));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, int checksum, zio_done_func_t *done, void *private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(vd->vdev_children == 0);
|
|
|
|
ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
|
|
|
|
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
|
|
|
|
ASSERT3U(offset + size, <=, vd->vdev_psize);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
|
|
|
|
private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
|
|
|
|
offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_prop.zp_checksum = checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_t *
|
|
|
|
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *data, int checksum, zio_done_func_t *done, void *private,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(vd->vdev_children == 0);
|
|
|
|
ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
|
|
|
|
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
|
|
|
|
ASSERT3U(offset + size, <=, vd->vdev_psize);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
|
|
|
|
private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
|
|
|
|
offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_prop.zp_checksum = checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-06-16 01:47:05 +03:00
|
|
|
if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* zec checksums are necessarily destructive -- they modify
|
2008-12-03 23:09:06 +03:00
|
|
|
* the end of the write buffer to hold the verifier/checksum.
|
2008-11-20 23:01:55 +03:00
|
|
|
* Therefore, we must make a local copy in case the data is
|
2008-12-03 23:09:06 +03:00
|
|
|
* being written to multiple places in parallel.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *wbuf = abd_alloc_sametype(data, size);
|
|
|
|
abd_copy(wbuf, data, size);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_push_transform(zio, wbuf, size, size, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Create a child I/O to do some work for us.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
zio_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
|
2017-01-21 00:17:55 +03:00
|
|
|
abd_t *data, uint64_t size, int type, zio_priority_t priority,
|
|
|
|
enum zio_flag flags, zio_done_func_t *done, void *private)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
/*
|
|
|
|
* vdev child I/Os do not propagate their error to the parent.
|
|
|
|
* Therefore, for correct operation the caller *must* check for
|
|
|
|
* and handle the error in the child i/o's done callback.
|
|
|
|
* The only exceptions are i/os that we don't care about
|
|
|
|
* (OPTIONAL or REPAIR).
|
|
|
|
*/
|
|
|
|
ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
|
|
|
|
done != NULL);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (type == ZIO_TYPE_READ && bp != NULL) {
|
|
|
|
/*
|
|
|
|
* If we have the bp, then the child should perform the
|
|
|
|
* checksum and the parent need not. This pushes error
|
|
|
|
* detection as close to the leaves as possible and
|
|
|
|
* eliminates redundant checksums in the interior nodes.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
|
|
|
|
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
if (vd->vdev_ops->vdev_op_leaf) {
|
|
|
|
ASSERT0(vd->vdev_children);
|
2008-12-03 23:09:06 +03:00
|
|
|
offset += VDEV_LABEL_START_SIZE;
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
flags |= ZIO_VDEV_CHILD_FLAGS(pio);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we've decided to do a repair, the write is not speculative --
|
|
|
|
* even if the original read was.
|
|
|
|
*/
|
|
|
|
if (flags & ZIO_FLAG_IO_REPAIR)
|
|
|
|
flags &= ~ZIO_FLAG_SPECULATIVE;
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* If we're creating a child I/O that is not associated with a
|
|
|
|
* top-level vdev, then the child zio is not an allocating I/O.
|
|
|
|
* If this is a retried I/O then we ignore it since we will
|
|
|
|
* have already processed the original allocating I/O.
|
|
|
|
*/
|
|
|
|
if (flags & ZIO_FLAG_IO_ALLOCATING &&
|
|
|
|
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
|
2016-10-18 20:44:44 +03:00
|
|
|
ASSERTV(metaslab_class_t *mc = spa_normal_class(pio->io_spa));
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
ASSERT(mc->mc_alloc_throttle_enabled);
|
|
|
|
ASSERT(type == ZIO_TYPE_WRITE);
|
|
|
|
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
|
|
|
|
ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
|
|
|
|
pio->io_child_type == ZIO_CHILD_GANG);
|
|
|
|
|
|
|
|
flags &= ~ZIO_FLAG_IO_ALLOCATING;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
|
2010-05-29 00:45:14 +04:00
|
|
|
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
|
|
|
|
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
zio->io_physdone = pio->io_physdone;
|
|
|
|
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
|
|
|
|
zio->io_logical->io_phys_children++;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
|
OpenZFS 9290 - device removal reduces redundancy of mirrors
Mirrors are supposed to provide redundancy in the face of whole-disk
failure and silent damage (e.g. some data on disk is not right, but ZFS
hasn't detected the whole device as being broken). However, the current
device removal implementation bypasses some of the mirror's redundancy.
Note that in no case is incorrect data returned, but we might get a
checksum error when we should have been able to find the right data.
There are two underlying problems:
1. When we remove a mirror device, we only read one side of the mirror.
Since we can't verify the checksum, this side may be silently bad, but
the good data is on the other side of the mirror (which we didn't read).
This can cause the removal to "bake in" the busted data – all copies of
the data in the new location are the same, busted version, while we left
the good version behind.
The fix for this is to read and copy both sides of the mirror. If the
old and new vdevs are mirrors, we will read both sides of the old
mirror, and write each copy to the corresponding side of the new mirror.
(If the old and new vdevs have a different number of children, we will
do this as best as possible.) Even though we aren't verifying checksums,
this ensures that as long as there's a good copy of the data, we'll have
a good copy after the removal, even if there's silent damage to one side
of the mirror. If we're removing a mirror that has some silent damage,
we'll have exactly the same damage in the new location (assuming that
the new location is also a mirror).
2. When we read from an indirect vdev that points to a mirror vdev, we
only consider one copy of the data. This can lead to reduced effective
redundancy, because we might read a bad copy of the data from one side
of the mirror, and not retry the other, good side of the mirror.
Note that the problem is not with the removal process, but rather after
the removal has completed (having copied correct data to both sides of
the mirror), if one side of the new mirror is silently damaged, we
encounter the problem when reading the relocated data via the indirect
vdev. Also note that the problem doesn't occur when ZFS knows that one
side of the mirror is bad, e.g. when a disk entirely fails or is
offlined.
The impact is that reads (from indirect vdevs that point to mirrors) may
return a checksum error even though the good data exists on one side of
the mirror, and scrub doesn't repair all data on the mirror (if some of
it is pointed to via an indirect vdev).
The fix for this is complicated by "split blocks" - one logical block
may be split into two (or more) pieces with each piece moved to a
different new location. In this case we need to read all versions of
each split (one from each side of the mirror), and figure out which
combination of versions results in the correct checksum, and then repair
the incorrect versions.
This ensures that we supply the same redundancy whether you use device
removal or not. For example, if a mirror has small silent errors on all
of its children, we can still reconstruct the correct data, as long as
those errors are at sufficiently-separated offsets (specifically,
separated by the largest block size - default of 128KB, but up to 16MB).
Porting notes:
* A new indirect vdev check was moved from dsl_scan_needs_resilver_cb()
to dsl_scan_needs_resilver(), which was added to ZoL as part of the
sequential scrub work.
* Passed NULL for zfs_ereport_post_checksum()'s zbookmark_phys_t
parameter. The extra parameter is unique to ZoL.
* When posting indirect checksum errors the ABD can be passed directly,
zfs_ereport_post_checksum() is not yet ABD-aware in OpenZFS.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9290
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/591
Closes #6900
2018-02-13 22:37:56 +03:00
|
|
|
zio_type_t type, zio_priority_t priority, enum zio_flag flags,
|
2017-01-12 20:42:11 +03:00
|
|
|
zio_done_func_t *done, void *private)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(vd->vdev_ops->vdev_op_leaf);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
|
2016-07-11 20:45:52 +03:00
|
|
|
data, size, size, done, private, type, priority,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
|
2008-12-03 23:09:06 +03:00
|
|
|
vd, offset, NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_flush(zio_t *zio, vdev_t *vd)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
NULL, NULL,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
void
|
|
|
|
zio_shrink(zio_t *zio, uint64_t size)
|
|
|
|
{
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3P(zio->io_executor, ==, NULL);
|
|
|
|
ASSERT3U(zio->io_orig_size, ==, zio->io_size);
|
|
|
|
ASSERT3U(size, <=, zio->io_size);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't shrink for raidz because of problems with the
|
|
|
|
* reconstruction when reading back less than the block size.
|
|
|
|
* Note, BP_IS_RAIDZ() assumes no compression.
|
|
|
|
*/
|
|
|
|
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
|
2016-07-11 20:45:52 +03:00
|
|
|
if (!BP_IS_RAIDZ(zio->io_bp)) {
|
|
|
|
/* we are not doing a raw write */
|
|
|
|
ASSERT3U(zio->io_size, ==, zio->io_lsize);
|
|
|
|
zio->io_orig_size = zio->io_size = zio->io_lsize = size;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* Prepare to read and write logical blocks
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_read_bp_init(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
uint64_t psize =
|
|
|
|
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
|
|
|
|
|
2009-01-16 00:59:39 +03:00
|
|
|
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
|
2009-07-03 02:44:48 +04:00
|
|
|
zio->io_child_type == ZIO_CHILD_LOGICAL &&
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
|
|
|
|
psize, psize, zio_decompress);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
|
|
|
|
BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
|
|
|
|
zio->io_child_type == ZIO_CHILD_LOGICAL) {
|
|
|
|
zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
|
|
|
|
psize, psize, zio_decrypt);
|
|
|
|
}
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
|
2016-07-22 18:52:49 +03:00
|
|
|
int psize = BPE_GET_PSIZE(bp);
|
|
|
|
void *data = abd_borrow_buf(zio->io_abd, psize);
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
2016-07-22 18:52:49 +03:00
|
|
|
decode_embedded_bp_compressed(bp, data);
|
|
|
|
abd_return_buf_copy(zio->io_abd, data, psize);
|
2014-06-06 01:19:08 +04:00
|
|
|
} else {
|
|
|
|
ASSERT(!BP_IS_EMBEDDED(bp));
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
|
2014-06-06 01:19:08 +04:00
|
|
|
}
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
|
|
|
|
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
|
|
|
|
|
|
|
|
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
|
|
|
|
zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static int
|
|
|
|
zio_write_bp_init(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
if (!IO_IS_ALLOCATING(zio))
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
|
|
|
|
|
|
|
|
if (zio->io_bp_override) {
|
2016-10-14 03:59:18 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(bp->blk_birth != zio->io_txg);
|
|
|
|
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
|
|
|
|
|
|
|
|
*bp = *zio->io_bp_override;
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp))
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
|
|
|
* If we've been overridden and nopwrite is set then
|
|
|
|
* set the flag accordingly to indicate that a nopwrite
|
|
|
|
* has already occurred.
|
|
|
|
*/
|
|
|
|
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
|
|
|
|
ASSERT(!zp->zp_dedup);
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
|
2013-05-10 23:47:54 +04:00
|
|
|
zio->io_flags |= ZIO_FLAG_NOPWRITE;
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(!zp->zp_nopwrite);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
2016-06-16 01:47:05 +03:00
|
|
|
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
|
|
|
|
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
|
|
|
|
!zp->zp_encrypt) {
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_DEDUP(bp, 1);
|
|
|
|
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We were unable to handle this as an override bp, treat
|
|
|
|
* it as a regular write I/O.
|
|
|
|
*/
|
2015-11-04 23:19:17 +03:00
|
|
|
zio->io_bp_override = NULL;
|
2016-10-14 03:59:18 +03:00
|
|
|
*bp = zio->io_bp_orig;
|
|
|
|
zio->io_pipeline = zio->io_orig_pipeline;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_write_compress(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
enum zio_compress compress = zp->zp_compress;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
uint64_t lsize = zio->io_lsize;
|
|
|
|
uint64_t psize = zio->io_size;
|
|
|
|
int pass = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If our children haven't all reached the ready stage,
|
|
|
|
* wait for them and then repeat this pipeline stage.
|
|
|
|
*/
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
|
|
|
|
ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
|
2016-10-14 03:59:18 +03:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
if (!IO_IS_ALLOCATING(zio))
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
|
|
|
if (zio->io_children_ready != NULL) {
|
|
|
|
/*
|
|
|
|
* Now that all our children are ready, run the callback
|
|
|
|
* associated with this zio in case it wants to modify the
|
|
|
|
* data to be written.
|
|
|
|
*/
|
|
|
|
ASSERT3U(zp->zp_level, >, 0);
|
|
|
|
zio->io_children_ready(zio);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
|
|
|
|
ASSERT(zio->io_bp_override == NULL);
|
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* We're rewriting an existing block, which means we're
|
|
|
|
* working on behalf of spa_sync(). For spa_sync() to
|
|
|
|
* converge, it must eventually be the case that we don't
|
|
|
|
* have to allocate new blocks. But compression changes
|
|
|
|
* the blocksize, which forces a reallocate, and makes
|
|
|
|
* convergence take longer. Therefore, after the first
|
|
|
|
* few passes, stop compressing to ensure convergence.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
pass = spa_sync_pass(spa);
|
|
|
|
|
|
|
|
ASSERT(zio->io_txg == spa_syncing_txg(spa));
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(!BP_GET_DEDUP(bp));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-05-06 21:14:52 +04:00
|
|
|
if (pass >= zfs_sync_pass_dont_compress)
|
2008-12-03 23:09:06 +03:00
|
|
|
compress = ZIO_COMPRESS_OFF;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* Make sure someone doesn't change their mind on overwrites */
|
2014-06-06 01:19:08 +04:00
|
|
|
ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-11 20:45:52 +03:00
|
|
|
/* If it's a compressed write that is not raw, compress the buffer. */
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (compress != ZIO_COMPRESS_OFF &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
void *cbuf = zio_buf_alloc(lsize);
|
2016-07-22 18:52:49 +03:00
|
|
|
psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (psize == 0 || psize == lsize) {
|
2008-12-03 23:09:06 +03:00
|
|
|
compress = ZIO_COMPRESS_OFF;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_buf_free(cbuf, lsize);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
} else if (!zp->zp_dedup && !zp->zp_encrypt &&
|
|
|
|
psize <= BPE_PAYLOAD_SIZE &&
|
2014-06-06 01:19:08 +04:00
|
|
|
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
|
|
|
|
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
|
|
|
|
encode_embedded_bp_compressed(bp,
|
|
|
|
cbuf, compress, lsize, psize);
|
|
|
|
BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
|
|
|
|
BP_SET_TYPE(bp, zio->io_prop.zp_type);
|
|
|
|
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
|
|
|
|
zio_buf_free(cbuf, lsize);
|
|
|
|
bp->blk_birth = zio->io_txg;
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
ASSERT(spa_feature_is_active(spa,
|
|
|
|
SPA_FEATURE_EMBEDDED_DATA));
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else {
|
2014-06-06 01:19:08 +04:00
|
|
|
/*
|
2015-05-20 07:14:01 +03:00
|
|
|
* Round up compressed size up to the ashift
|
|
|
|
* of the smallest-ashift device, and zero the tail.
|
|
|
|
* This ensures that the compressed size of the BP
|
|
|
|
* (and thus compressratio property) are correct,
|
|
|
|
* in that we charge for the padding used to fill out
|
|
|
|
* the last sector.
|
2014-06-06 01:19:08 +04:00
|
|
|
*/
|
2015-05-20 07:14:01 +03:00
|
|
|
ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
|
2017-11-04 23:25:13 +03:00
|
|
|
size_t rounded = (size_t)P2ROUNDUP(psize,
|
2015-05-20 07:14:01 +03:00
|
|
|
1ULL << spa->spa_min_ashift);
|
|
|
|
if (rounded >= lsize) {
|
2014-06-06 01:19:08 +04:00
|
|
|
compress = ZIO_COMPRESS_OFF;
|
|
|
|
zio_buf_free(cbuf, lsize);
|
2015-05-20 07:14:01 +03:00
|
|
|
psize = lsize;
|
2014-06-06 01:19:08 +04:00
|
|
|
} else {
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *cdata = abd_get_from_buf(cbuf, lsize);
|
|
|
|
abd_take_ownership_of_buf(cdata, B_TRUE);
|
|
|
|
abd_zero_off(cdata, psize, rounded - psize);
|
2015-05-20 07:14:01 +03:00
|
|
|
psize = rounded;
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_push_transform(zio, cdata,
|
2014-06-06 01:19:08 +04:00
|
|
|
psize, lsize, NULL);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We were unable to handle this as an override bp, treat
|
|
|
|
* it as a regular write I/O.
|
|
|
|
*/
|
|
|
|
zio->io_bp_override = NULL;
|
|
|
|
*bp = zio->io_bp_orig;
|
|
|
|
zio->io_pipeline = zio->io_orig_pipeline;
|
|
|
|
|
2018-02-21 23:28:52 +03:00
|
|
|
} else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 &&
|
|
|
|
zp->zp_type == DMU_OT_DNODE) {
|
|
|
|
/*
|
|
|
|
* The DMU actually relies on the zio layer's compression
|
|
|
|
* to free metadnode blocks that have had all contained
|
|
|
|
* dnodes freed. As a result, even when doing a raw
|
|
|
|
* receive, we must check whether the block can be compressed
|
|
|
|
* to a hole.
|
|
|
|
*/
|
|
|
|
psize = zio_compress_data(ZIO_COMPRESS_EMPTY,
|
|
|
|
zio->io_abd, NULL, lsize);
|
|
|
|
if (psize == 0)
|
|
|
|
compress = ZIO_COMPRESS_OFF;
|
2016-07-11 20:45:52 +03:00
|
|
|
} else {
|
|
|
|
ASSERT3U(psize, !=, 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* The final pass of spa_sync() must be all rewrites, but the first
|
|
|
|
* few passes offer a trade-off: allocating blocks defers convergence,
|
|
|
|
* but newly allocated blocks are sequential, so they can be written
|
|
|
|
* to disk faster. Therefore, we allow the first few passes of
|
|
|
|
* spa_sync() to allocate new blocks, but force rewrites after that.
|
|
|
|
* There should only be a handful of blocks after pass 1 in any case.
|
|
|
|
*/
|
2013-12-09 22:37:51 +04:00
|
|
|
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
|
|
|
|
BP_GET_PSIZE(bp) == psize &&
|
2013-05-06 21:14:52 +04:00
|
|
|
pass >= zfs_sync_pass_rewrite) {
|
2010-08-26 20:52:39 +04:00
|
|
|
ASSERT(psize != 0);
|
2017-11-04 23:25:13 +03:00
|
|
|
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
|
|
|
|
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
|
|
|
|
} else {
|
|
|
|
BP_ZERO(bp);
|
|
|
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (psize == 0) {
|
2013-12-09 22:37:51 +04:00
|
|
|
if (zio->io_bp_orig.blk_birth != 0 &&
|
|
|
|
spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
|
|
|
|
BP_SET_LSIZE(bp, lsize);
|
|
|
|
BP_SET_TYPE(bp, zp->zp_type);
|
|
|
|
BP_SET_LEVEL(bp, zp->zp_level);
|
|
|
|
BP_SET_BIRTH(bp, zio->io_txg, 0);
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
} else {
|
|
|
|
ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
|
|
|
|
BP_SET_LSIZE(bp, lsize);
|
2013-12-09 22:37:51 +04:00
|
|
|
BP_SET_TYPE(bp, zp->zp_type);
|
|
|
|
BP_SET_LEVEL(bp, zp->zp_level);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_PSIZE(bp, psize);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_COMPRESS(bp, compress);
|
|
|
|
BP_SET_CHECKSUM(bp, zp->zp_checksum);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_DEDUP(bp, zp->zp_dedup);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zp->zp_dedup) {
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
ASSERT(!zp->zp_encrypt ||
|
|
|
|
DMU_OT_IS_ENCRYPTED(zp->zp_type));
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
|
|
|
|
}
|
2013-05-10 23:47:54 +04:00
|
|
|
if (zp->zp_nopwrite) {
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
|
|
|
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_free_bp_init(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
|
|
|
|
if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
|
|
|
|
if (BP_GET_DEDUP(bp))
|
|
|
|
zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Execute the I/O pipeline
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void
|
2013-05-06 23:24:30 +04:00
|
|
|
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_t *spa = zio->io_spa;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_type_t t = zio->io_type;
|
2011-11-08 04:26:52 +04:00
|
|
|
int flags = (cutinline ? TQ_FRONT : 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2009-07-03 02:44:48 +04:00
|
|
|
* If we're a config writer or a probe, the normal issue and
|
|
|
|
* interrupt threads may all be blocked waiting for the config lock.
|
|
|
|
* In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
|
2008-12-03 23:09:06 +03:00
|
|
|
t = ZIO_TYPE_NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* A similar issue exists for the L2ARC write thread until L2ARC 2.0.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
|
|
|
|
t = ZIO_TYPE_NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
2013-05-06 23:24:30 +04:00
|
|
|
* If this is a high priority I/O, then use the high priority taskq if
|
|
|
|
* available.
|
2010-05-29 00:45:14 +04:00
|
|
|
*/
|
|
|
|
if (zio->io_priority == ZIO_PRIORITY_NOW &&
|
2013-05-06 23:24:30 +04:00
|
|
|
spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
|
2010-05-29 00:45:14 +04:00
|
|
|
q++;
|
|
|
|
|
|
|
|
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
|
2010-08-26 21:32:23 +04:00
|
|
|
|
2011-11-08 04:26:52 +04:00
|
|
|
/*
|
|
|
|
* NB: We are assuming that the zio can only be dispatched
|
|
|
|
* to a single taskq at a time. It would be a grievous error
|
|
|
|
* to dispatch the zio to another taskq at the same time.
|
|
|
|
*/
|
|
|
|
ASSERT(taskq_empty_ent(&zio->io_tqent));
|
2013-05-06 23:24:30 +04:00
|
|
|
spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
|
|
|
|
flags, &zio->io_tqent);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static boolean_t
|
2013-05-06 23:24:30 +04:00
|
|
|
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
kthread_t *executor = zio->io_executor;
|
|
|
|
spa_t *spa = zio->io_spa;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
|
2013-05-06 23:24:30 +04:00
|
|
|
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
|
|
|
|
uint_t i;
|
|
|
|
for (i = 0; i < tqs->stqs_count; i++) {
|
|
|
|
if (taskq_member(tqs->stqs_taskq[i], executor))
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (B_FALSE);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static int
|
|
|
|
zio_issue_async(zio_t *zio)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_interrupt(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-05-23 20:41:29 +03:00
|
|
|
void
|
|
|
|
zio_delay_interrupt(zio_t *zio)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The timeout_generic() function isn't defined in userspace, so
|
|
|
|
* rather than trying to implement the function, the zio delay
|
|
|
|
* functionality has been disabled for userspace builds.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifdef _KERNEL
|
|
|
|
/*
|
|
|
|
* If io_target_timestamp is zero, then no delay has been registered
|
|
|
|
* for this IO, thus jump to the end of this function and "skip" the
|
|
|
|
* delay; issuing it directly to the zio layer.
|
|
|
|
*/
|
|
|
|
if (zio->io_target_timestamp != 0) {
|
|
|
|
hrtime_t now = gethrtime();
|
|
|
|
|
|
|
|
if (now >= zio->io_target_timestamp) {
|
|
|
|
/*
|
|
|
|
* This IO has already taken longer than the target
|
|
|
|
* delay to complete, so we don't want to delay it
|
|
|
|
* any longer; we "miss" the delay and issue it
|
|
|
|
* directly to the zio layer. This is likely due to
|
|
|
|
* the target latency being set to a value less than
|
|
|
|
* the underlying hardware can satisfy (e.g. delay
|
|
|
|
* set to 1ms, but the disks take 10ms to complete an
|
|
|
|
* IO request).
|
|
|
|
*/
|
|
|
|
|
|
|
|
DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
|
|
|
|
hrtime_t, now);
|
|
|
|
|
|
|
|
zio_interrupt(zio);
|
|
|
|
} else {
|
|
|
|
taskqid_t tid;
|
|
|
|
hrtime_t diff = zio->io_target_timestamp - now;
|
|
|
|
clock_t expire_at_tick = ddi_get_lbolt() +
|
|
|
|
NSEC_TO_TICK(diff);
|
|
|
|
|
|
|
|
DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
|
|
|
|
hrtime_t, now, hrtime_t, diff);
|
|
|
|
|
|
|
|
if (NSEC_TO_TICK(diff) == 0) {
|
|
|
|
/* Our delay is less than a jiffy - just spin */
|
|
|
|
zfs_sleep_until(zio->io_target_timestamp);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Use taskq_dispatch_delay() in the place of
|
|
|
|
* OpenZFS's timeout_generic().
|
|
|
|
*/
|
|
|
|
tid = taskq_dispatch_delay(system_taskq,
|
2016-12-12 21:46:26 +03:00
|
|
|
(task_func_t *)zio_interrupt,
|
2016-05-23 20:41:29 +03:00
|
|
|
zio, TQ_NOSLEEP, expire_at_tick);
|
2016-10-29 01:40:14 +03:00
|
|
|
if (tid == TASKQID_INVALID) {
|
2016-05-23 20:41:29 +03:00
|
|
|
/*
|
|
|
|
* Couldn't allocate a task. Just
|
|
|
|
* finish the zio without a delay.
|
|
|
|
*/
|
|
|
|
zio_interrupt(zio);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
|
|
|
|
zio_interrupt(zio);
|
|
|
|
}
|
|
|
|
|
2017-12-19 01:06:07 +03:00
|
|
|
static void
|
|
|
|
zio_deadman_impl(zio_t *pio)
|
|
|
|
{
|
|
|
|
zio_t *cio, *cio_next;
|
|
|
|
zio_link_t *zl = NULL;
|
|
|
|
vdev_t *vd = pio->io_vd;
|
|
|
|
|
|
|
|
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
|
|
|
|
vdev_queue_t *vq = &vd->vdev_queue;
|
|
|
|
zbookmark_phys_t *zb = &pio->io_bookmark;
|
|
|
|
uint64_t delta = gethrtime() - pio->io_timestamp;
|
|
|
|
uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
|
|
|
|
|
|
|
|
zfs_dbgmsg("slow zio: zio=%p timestamp=%llu "
|
|
|
|
"delta=%llu queued=%llu io=%llu "
|
|
|
|
"path=%s last=%llu "
|
|
|
|
"type=%d priority=%d flags=0x%x "
|
|
|
|
"stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
|
|
|
|
"objset=%llu object=%llu level=%llu blkid=%llu "
|
|
|
|
"offset=%llu size=%llu error=%d",
|
|
|
|
pio, pio->io_timestamp,
|
|
|
|
delta, pio->io_delta, pio->io_delay,
|
|
|
|
vd->vdev_path, vq->vq_io_complete_ts,
|
|
|
|
pio->io_type, pio->io_priority, pio->io_flags,
|
|
|
|
pio->io_state, pio->io_pipeline, pio->io_pipeline_trace,
|
|
|
|
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
|
|
|
|
pio->io_offset, pio->io_size, pio->io_error);
|
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
|
|
|
|
pio->io_spa, vd, zb, pio, 0, 0);
|
|
|
|
|
|
|
|
if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
|
|
|
|
taskq_empty_ent(&pio->io_tqent)) {
|
|
|
|
zio_interrupt(pio);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
|
|
|
|
cio_next = zio_walk_children(pio, &zl);
|
|
|
|
zio_deadman_impl(cio);
|
|
|
|
}
|
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Log the critical information describing this zio and all of its children
|
|
|
|
* using the zfs_dbgmsg() interface then post deadman event for the ZED.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zio_deadman(zio_t *pio, char *tag)
|
|
|
|
{
|
|
|
|
spa_t *spa = pio->io_spa;
|
|
|
|
char *name = spa_name(spa);
|
|
|
|
|
|
|
|
if (!zfs_deadman_enabled || spa_suspended(spa))
|
|
|
|
return;
|
|
|
|
|
|
|
|
zio_deadman_impl(pio);
|
|
|
|
|
|
|
|
switch (spa_get_deadman_failmode(spa)) {
|
|
|
|
case ZIO_FAILURE_MODE_WAIT:
|
|
|
|
zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ZIO_FAILURE_MODE_CONTINUE:
|
|
|
|
zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ZIO_FAILURE_MODE_PANIC:
|
|
|
|
fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Execute the I/O pipeline until one of the following occurs:
|
|
|
|
* (1) the I/O completes; (2) the pipeline stalls waiting for
|
|
|
|
* dependent child I/Os; (3) the I/O issues, so we're waiting
|
|
|
|
* for an I/O completion interrupt; (4) the I/O is delegated by
|
|
|
|
* vdev-level caching or aggregation; (5) the I/O is deferred
|
|
|
|
* due to vdev-level queueing; (6) the I/O is handed off to
|
|
|
|
* another thread. In all cases, the pipeline stops whenever
|
2013-07-03 06:00:16 +04:00
|
|
|
* there's no CPU work; it never burns a thread in cv_wait_io().
|
2008-12-03 23:09:06 +03:00
|
|
|
*
|
|
|
|
* There's no locking on io_stage because there's no legitimate way
|
|
|
|
* for multiple threads to be attempting to process the same I/O.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static zio_pipe_stage_t *zio_pipeline[];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
/*
|
|
|
|
* zio_execute() is a wrapper around the static function
|
|
|
|
* __zio_execute() so that we can force __zio_execute() to be
|
|
|
|
* inlined. This reduces stack overhead which is important
|
|
|
|
* because __zio_execute() is called recursively in several zio
|
|
|
|
* code paths. zio_execute() itself cannot be inlined because
|
|
|
|
* it is externally visible.
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_execute(zio_t *zio)
|
2010-08-26 22:38:38 +04:00
|
|
|
{
|
2014-07-13 22:35:19 +04:00
|
|
|
fstrans_cookie_t cookie;
|
|
|
|
|
|
|
|
cookie = spl_fstrans_mark();
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(zio);
|
2014-07-13 22:35:19 +04:00
|
|
|
spl_fstrans_unmark(cookie);
|
2010-08-26 22:38:38 +04:00
|
|
|
}
|
|
|
|
|
2015-12-02 22:53:37 +03:00
|
|
|
/*
|
|
|
|
* Used to determine if in the current context the stack is sized large
|
|
|
|
* enough to allow zio_execute() to be called recursively. A minimum
|
|
|
|
* stack size of 16K is required to avoid needing to re-dispatch the zio.
|
|
|
|
*/
|
|
|
|
boolean_t
|
|
|
|
zio_execute_stack_check(zio_t *zio)
|
|
|
|
{
|
|
|
|
#if !defined(HAVE_LARGE_STACKS)
|
|
|
|
dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
|
|
|
|
|
|
|
|
/* Executing in txg_sync_thread() context. */
|
|
|
|
if (dp && curthread == dp->dp_tx.tx_sync_thread)
|
|
|
|
return (B_TRUE);
|
|
|
|
|
|
|
|
/* Pool initialization outside of zio_taskq context. */
|
|
|
|
if (dp && spa_is_initializing(dp->dp_spa) &&
|
|
|
|
!zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
|
|
|
|
!zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
|
|
|
|
return (B_TRUE);
|
|
|
|
#endif /* HAVE_LARGE_STACKS */
|
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
__attribute__((always_inline))
|
|
|
|
static inline void
|
|
|
|
__zio_execute(zio_t *zio)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
zio->io_executor = curthread;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT3U(zio->io_queued_timestamp, >, 0);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
while (zio->io_stage < ZIO_STAGE_DONE) {
|
2010-05-29 00:45:14 +04:00
|
|
|
enum zio_stage pipeline = zio->io_pipeline;
|
|
|
|
enum zio_stage stage = zio->io_stage;
|
2008-12-03 23:09:06 +03:00
|
|
|
int rv;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(!MUTEX_HELD(&zio->io_lock));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(ISP2(stage));
|
|
|
|
ASSERT(zio->io_stall == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
do {
|
|
|
|
stage <<= 1;
|
|
|
|
} while ((stage & pipeline) == 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
ASSERT(stage <= ZIO_STAGE_DONE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* If we are in interrupt context and this pipeline stage
|
|
|
|
* will grab a config lock that is held across I/O,
|
2010-05-29 00:45:14 +04:00
|
|
|
* or may wait for an I/O that needs an interrupt thread
|
|
|
|
* to complete, issue async to avoid deadlock.
|
|
|
|
*
|
|
|
|
* For VDEV_IO_START, we cut in line so that the io will
|
|
|
|
* be sent to disk promptly.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2012-12-18 04:23:27 +04:00
|
|
|
if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
|
|
|
|
zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
|
2015-12-02 22:53:37 +03:00
|
|
|
boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
|
|
|
|
zio_requeue_io_start_cut_in_line : B_FALSE;
|
2012-12-18 04:23:27 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-12-02 22:53:37 +03:00
|
|
|
* If the current context doesn't have large enough stacks
|
|
|
|
* the zio must be issued asynchronously to prevent overflow.
|
2012-12-18 04:23:27 +04:00
|
|
|
*/
|
2015-12-02 22:53:37 +03:00
|
|
|
if (zio_execute_stack_check(zio)) {
|
|
|
|
boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
|
|
|
|
zio_requeue_io_start_cut_in_line : B_FALSE;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
|
2008-12-03 23:09:06 +03:00
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_stage = stage;
|
2016-10-14 03:59:18 +03:00
|
|
|
zio->io_pipeline_trace |= zio->io_stage;
|
2014-04-16 07:40:22 +04:00
|
|
|
rv = zio_pipeline[highbit64(stage) - 1](zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (rv == ZIO_PIPELINE_STOP)
|
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(rv == ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Initiate I/O, either sync or async
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zio_wait(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2017-12-19 01:06:07 +03:00
|
|
|
long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
|
2008-12-03 23:09:06 +03:00
|
|
|
int error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
|
|
|
|
ASSERT3P(zio->io_executor, ==, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_waiter = curthread;
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT0(zio->io_queued_timestamp);
|
|
|
|
zio->io_queued_timestamp = gethrtime();
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
2017-12-19 01:06:07 +03:00
|
|
|
while (zio->io_executor != NULL) {
|
|
|
|
error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
|
|
|
|
ddi_get_lbolt() + timeout);
|
|
|
|
|
|
|
|
if (zfs_deadman_enabled && error == -1 &&
|
|
|
|
gethrtime() - zio->io_queued_timestamp >
|
|
|
|
spa_deadman_ziotime(zio->io_spa)) {
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
|
|
|
|
zio_deadman(zio, FTAG);
|
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
}
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&zio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
error = zio->io_error;
|
|
|
|
zio_destroy(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (error);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
zio_nowait(zio_t *zio)
|
|
|
|
{
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3P(zio->io_executor, ==, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
|
|
|
|
zio_unique_parent(zio) == NULL) {
|
2014-10-08 00:20:49 +04:00
|
|
|
zio_t *pio;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* This is a logical async I/O with no parent to wait for it.
|
2009-07-03 02:44:48 +04:00
|
|
|
* We add it to the spa_async_root_zio "Godfather" I/O which
|
|
|
|
* will ensure they complete prior to unloading the pool.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
spa_t *spa = zio->io_spa;
|
2014-10-08 00:20:49 +04:00
|
|
|
kpreempt_disable();
|
|
|
|
pio = spa->spa_async_zio_root[CPU_SEQID];
|
|
|
|
kpreempt_enable();
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2014-10-08 00:20:49 +04:00
|
|
|
zio_add_child(pio, zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT0(zio->io_queued_timestamp);
|
|
|
|
zio->io_queued_timestamp = gethrtime();
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
* Reexecute, cancel, or suspend/resume failed I/O
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_reexecute(zio_t *pio)
|
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *cio, *cio_next;
|
|
|
|
|
|
|
|
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(pio->io_gang_leader == NULL);
|
|
|
|
ASSERT(pio->io_gang_tree == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_flags = pio->io_orig_flags;
|
|
|
|
pio->io_stage = pio->io_orig_stage;
|
|
|
|
pio->io_pipeline = pio->io_orig_pipeline;
|
|
|
|
pio->io_reexecute = 0;
|
2013-05-10 23:47:54 +04:00
|
|
|
pio->io_flags |= ZIO_FLAG_REEXECUTED;
|
2016-10-14 03:59:18 +03:00
|
|
|
pio->io_pipeline_trace = 0;
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_error = 0;
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
|
2009-02-18 23:51:31 +03:00
|
|
|
pio->io_state[w] = 0;
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_child_error[c] = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (IO_IS_ALLOCATING(pio))
|
|
|
|
BP_ZERO(pio->io_bp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* As we reexecute pio's children, new children could be created.
|
2009-02-18 23:51:31 +03:00
|
|
|
* New children go to the head of pio's io_child_list, however,
|
2008-12-03 23:09:06 +03:00
|
|
|
* so we will (correctly) not reexecute them. The key is that
|
2009-02-18 23:51:31 +03:00
|
|
|
* the remainder of pio's io_child_list, from 'cio_next' onward,
|
|
|
|
* cannot be affected by any side effects of reexecuting 'cio'.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
2017-11-04 23:25:13 +03:00
|
|
|
zio_link_t *zl = NULL;
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
2016-10-14 03:59:18 +03:00
|
|
|
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
|
|
|
|
cio_next = zio_walk_children(pio, &zl);
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
|
2009-02-18 23:51:31 +03:00
|
|
|
pio->io_children[cio->io_child_type][w]++;
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_reexecute(cio);
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_enter(&pio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2017-12-21 20:13:06 +03:00
|
|
|
mutex_exit(&pio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Now that all children have been reexecuted, execute the parent.
|
2009-07-03 02:44:48 +04:00
|
|
|
* We don't reexecute "The Godfather" I/O here as it's the
|
2017-02-17 22:48:20 +03:00
|
|
|
* responsibility of the caller to wait on it.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
2016-10-14 03:59:18 +03:00
|
|
|
if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
|
|
|
|
pio->io_queued_timestamp = gethrtime();
|
2010-08-26 22:38:38 +04:00
|
|
|
__zio_execute(pio);
|
2016-10-14 03:59:18 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
2018-03-15 20:56:55 +03:00
|
|
|
zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
|
|
|
|
fm_panic("Pool '%s' has encountered an uncorrectable I/O "
|
|
|
|
"failure and the failure mode property for this pool "
|
|
|
|
"is set to panic.", spa_name(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-07-11 02:13:09 +04:00
|
|
|
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
|
|
|
|
"failure and has been suspended.\n", spa_name(spa));
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
|
|
|
|
NULL, NULL, 0, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&spa->spa_suspend_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (spa->spa_suspend_zio_root == NULL)
|
2009-07-03 02:44:48 +04:00
|
|
|
spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
|
|
|
|
ZIO_FLAG_GODFATHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2018-03-15 20:56:55 +03:00
|
|
|
spa->spa_suspended = reason;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio != NULL) {
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio != spa->spa_suspend_zio_root);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
2009-02-18 23:51:31 +03:00
|
|
|
ASSERT(zio_unique_parent(zio) == NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_DONE);
|
|
|
|
zio_add_child(spa->spa_suspend_zio_root, zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_exit(&spa->spa_suspend_lock);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_resume(spa_t *spa)
|
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *pio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Reexecute all previously suspended i/o.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
mutex_enter(&spa->spa_suspend_lock);
|
2018-03-15 20:56:55 +03:00
|
|
|
spa->spa_suspended = ZIO_SUSPEND_NONE;
|
2008-12-03 23:09:06 +03:00
|
|
|
cv_broadcast(&spa->spa_suspend_cv);
|
|
|
|
pio = spa->spa_suspend_zio_root;
|
|
|
|
spa->spa_suspend_zio_root = NULL;
|
|
|
|
mutex_exit(&spa->spa_suspend_lock);
|
|
|
|
|
|
|
|
if (pio == NULL)
|
2009-07-03 02:44:48 +04:00
|
|
|
return (0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_reexecute(pio);
|
|
|
|
return (zio_wait(pio));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_resume_wait(spa_t *spa)
|
|
|
|
{
|
|
|
|
mutex_enter(&spa->spa_suspend_lock);
|
|
|
|
while (spa_suspended(spa))
|
|
|
|
cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
|
|
|
|
mutex_exit(&spa->spa_suspend_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* Gang blocks.
|
|
|
|
*
|
|
|
|
* A gang block is a collection of small blocks that looks to the DMU
|
|
|
|
* like one large block. When zio_dva_allocate() cannot find a block
|
|
|
|
* of the requested size, due to either severe fragmentation or the pool
|
|
|
|
* being nearly full, it calls zio_write_gang_block() to construct the
|
|
|
|
* block from smaller fragments.
|
|
|
|
*
|
|
|
|
* A gang block consists of a gang header (zio_gbh_phys_t) and up to
|
|
|
|
* three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
|
|
|
|
* an indirect block: it's an array of block pointers. It consumes
|
|
|
|
* only one sector and hence is allocatable regardless of fragmentation.
|
|
|
|
* The gang header's bps point to its gang members, which hold the data.
|
|
|
|
*
|
|
|
|
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
|
|
|
|
* as the verifier to ensure uniqueness of the SHA256 checksum.
|
|
|
|
* Critically, the gang block bp's blk_cksum is the checksum of the data,
|
|
|
|
* not the gang header. This ensures that data block signatures (needed for
|
|
|
|
* deduplication) are independent of how the block is physically stored.
|
|
|
|
*
|
|
|
|
* Gang blocks can be nested: a gang member may itself be a gang block.
|
|
|
|
* Thus every gang block is a tree in which root and all interior nodes are
|
|
|
|
* gang headers, and the leaves are normal blocks that contain user data.
|
|
|
|
* The root of the gang tree is called the gang leader.
|
|
|
|
*
|
|
|
|
* To perform any operation (read, rewrite, free, claim) on a gang block,
|
|
|
|
* zio_gang_assemble() first assembles the gang tree (minus data leaves)
|
|
|
|
* in the io_gang_tree field of the original logical i/o by recursively
|
|
|
|
* reading the gang leader and all gang headers below it. This yields
|
|
|
|
* an in-core tree containing the contents of every gang header and the
|
|
|
|
* bps for every constituent of the gang block.
|
|
|
|
*
|
|
|
|
* With the gang tree now assembled, zio_gang_issue() just walks the gang tree
|
|
|
|
* and invokes a callback on each bp. To free a gang block, zio_gang_issue()
|
|
|
|
* calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
|
|
|
|
* zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
|
|
|
|
* zio_read_gang() is a wrapper around zio_read() that omits reading gang
|
|
|
|
* headers, since we already have those in io_gang_tree. zio_rewrite_gang()
|
|
|
|
* performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
|
|
|
|
* of the gang header plus zio_checksum_compute() of the data to update the
|
|
|
|
* gang header's blk_cksum as described above.
|
|
|
|
*
|
|
|
|
* The two-phase assemble/issue model solves the problem of partial failure --
|
|
|
|
* what if you'd freed part of a gang block but then couldn't read the
|
|
|
|
* gang header for another part? Assembling the entire gang tree first
|
|
|
|
* ensures that all the necessary gang header I/O has succeeded before
|
|
|
|
* starting the actual work of free, claim, or write. Once the gang tree
|
|
|
|
* is assembled, free and claim are in-memory operations that cannot fail.
|
|
|
|
*
|
|
|
|
* In the event that a gang write fails, zio_dva_unallocate() walks the
|
|
|
|
* gang tree to immediately free (i.e. insert back into the space map)
|
|
|
|
* everything we've allocated. This ensures that we don't get ENOSPC
|
|
|
|
* errors during repeated suspend/resume cycles due to a flaky device.
|
|
|
|
*
|
|
|
|
* Gang rewrites only happen during sync-to-convergence. If we can't assemble
|
|
|
|
* the gang tree, we won't modify the block, so we can safely defer the free
|
|
|
|
* (knowing that the block is still intact). If we *can* assemble the gang
|
|
|
|
* tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
|
|
|
|
* each constituent bp and we can allocate a new block on the next sync pass.
|
|
|
|
*
|
|
|
|
* In all cases, the gang tree allows complete recovery from partial failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
static void
|
|
|
|
zio_gang_issue_func_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
abd_put(zio->io_abd);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static zio_t *
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
if (gn != NULL)
|
|
|
|
return (pio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
|
|
|
|
BP_GET_PSIZE(bp), zio_gang_issue_func_done,
|
|
|
|
NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
|
2008-12-03 23:09:06 +03:00
|
|
|
&pio->io_bookmark));
|
|
|
|
}
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
static zio_t *
|
|
|
|
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
if (gn != NULL) {
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *gbh_abd =
|
|
|
|
abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
|
|
|
|
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
|
|
|
|
&pio->io_bookmark);
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* As we rewrite each gang header, the pipeline will compute
|
|
|
|
* a new gang block header checksum for it; but no one will
|
|
|
|
* compute a new data checksum, so we do that here. The one
|
|
|
|
* exception is the gang leader: the pipeline already computed
|
|
|
|
* its data checksum because that stage precedes gang assembly.
|
|
|
|
* (Presently, nothing actually uses interior data checksums;
|
|
|
|
* this is just good hygiene.)
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
if (gn != pio->io_gang_leader->io_gang_tree) {
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *buf = abd_get_offset(data, offset);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
|
2016-07-22 18:52:49 +03:00
|
|
|
buf, BP_GET_PSIZE(bp));
|
|
|
|
|
|
|
|
abd_put(buf);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* If we are here to damage data for testing purposes,
|
|
|
|
* leave the GBH alone so that we can detect the damage.
|
|
|
|
*/
|
|
|
|
if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
|
|
|
|
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_get_offset(data, offset), BP_GET_PSIZE(bp),
|
|
|
|
zio_gang_issue_func_done, NULL, pio->io_priority,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* ARGSUSED */
|
2016-07-22 18:52:49 +03:00
|
|
|
static zio_t *
|
|
|
|
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
|
|
|
|
ZIO_GANG_CHILD_FLAGS(pio)));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* ARGSUSED */
|
2016-07-22 18:52:49 +03:00
|
|
|
static zio_t *
|
|
|
|
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
|
|
|
|
NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
|
|
|
|
}
|
|
|
|
|
|
|
|
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
|
|
|
|
NULL,
|
|
|
|
zio_read_gang,
|
|
|
|
zio_rewrite_gang,
|
|
|
|
zio_free_gang,
|
|
|
|
zio_claim_gang,
|
|
|
|
NULL
|
|
|
|
};
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void zio_gang_tree_assemble_done(zio_t *zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static zio_gang_node_t *
|
|
|
|
zio_gang_node_alloc(zio_gang_node_t **gnpp)
|
|
|
|
{
|
|
|
|
zio_gang_node_t *gn;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(*gnpp == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-11-21 03:09:39 +03:00
|
|
|
gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
|
2008-12-03 23:09:06 +03:00
|
|
|
gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
|
|
|
|
*gnpp = gn;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (gn);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_free(zio_gang_node_t **gnpp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = *gnpp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(gn->gn_child[g] == NULL);
|
|
|
|
|
|
|
|
zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
|
|
|
|
kmem_free(gn, sizeof (*gn));
|
|
|
|
*gnpp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_gang_tree_free(zio_gang_node_t **gnpp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = *gnpp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (gn == NULL)
|
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_tree_free(&gn->gn_child[g]);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_free(gnpp);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(gio->io_gang_leader == gio);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(BP_IS_GANG(bp));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
|
|
|
|
zio_gang_tree_assemble_done, gn, gio->io_priority,
|
|
|
|
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
|
|
|
zio_gang_tree_assemble_done(zio_t *zio)
|
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *gio = zio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_node_t *gn = zio->io_private;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(gio == zio_unique_parent(zio));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_count == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error)
|
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
/* this ABD was created from a linear buf in zio_gang_tree_assemble */
|
2008-12-03 23:09:06 +03:00
|
|
|
if (BP_SHOULD_BYTESWAP(bp))
|
2016-07-22 18:52:49 +03:00
|
|
|
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_put(zio->io_abd);
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
|
|
|
|
if (!BP_IS_GANG(gbp))
|
|
|
|
continue;
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
|
|
|
|
uint64_t offset)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *gio = pio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(BP_IS_GANG(bp) == !!gn);
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
|
|
|
|
ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* If you're a gang header, your data is in gn->gn_gbh.
|
|
|
|
* If you're a gang member, your data is in 'data' and gn == NULL.
|
|
|
|
*/
|
2016-07-22 18:52:49 +03:00
|
|
|
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (gn != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
|
|
|
|
if (BP_IS_HOLE(gbp))
|
|
|
|
continue;
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
|
|
|
|
offset);
|
|
|
|
offset += BP_GET_PSIZE(gbp);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (gn == gio->io_gang_tree)
|
2016-07-22 18:52:49 +03:00
|
|
|
ASSERT3U(gio->io_size, ==, offset);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio != pio)
|
|
|
|
zio_nowait(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_assemble(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
|
|
|
|
|
|
|
zio->io_gang_leader = zio;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_issue(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
|
|
|
|
0);
|
2008-12-03 23:09:06 +03:00
|
|
|
else
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_gang_tree_free(&zio->io_gang_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write_gang_member_ready(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *pio = zio_unique_parent(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
dva_t *cdva = zio->io_bp->blk_dva;
|
|
|
|
dva_t *pdva = pio->io_bp->blk_dva;
|
|
|
|
uint64_t asize;
|
2013-11-01 23:26:11 +04:00
|
|
|
ASSERTV(zio_t *gio = zio->io_gang_leader);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (BP_IS_HOLE(zio->io_bp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
|
|
|
|
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
|
|
|
|
ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
|
|
|
|
ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(DVA_GET_GANG(&pdva[d]));
|
|
|
|
asize = DVA_GET_ASIZE(&pdva[d]);
|
|
|
|
asize += DVA_GET_ASIZE(&cdva[d]);
|
|
|
|
DVA_SET_ASIZE(&pdva[d], asize);
|
|
|
|
}
|
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
static void
|
|
|
|
zio_write_gang_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
abd_put(zio->io_abd);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_write_gang_block(zio_t *pio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
spa_t *spa = pio->io_spa;
|
2016-10-14 03:59:18 +03:00
|
|
|
metaslab_class_t *mc = spa_normal_class(spa);
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = pio->io_bp;
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_t *gio = pio->io_gang_leader;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_t *zio;
|
|
|
|
zio_gang_node_t *gn, **gnpp;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_gbh_phys_t *gbh;
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *gbh_abd;
|
2008-12-03 23:09:06 +03:00
|
|
|
uint64_t txg = pio->io_txg;
|
|
|
|
uint64_t resid = pio->io_size;
|
|
|
|
uint64_t lsize;
|
2010-05-29 00:45:14 +04:00
|
|
|
int copies = gio->io_prop.zp_copies;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
int gbh_copies;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_prop_t zp;
|
2017-11-04 23:25:13 +03:00
|
|
|
int error;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* encrypted blocks need DVA[2] free so encrypted gang headers can't
|
|
|
|
* have a third copy.
|
|
|
|
*/
|
|
|
|
gbh_copies = MIN(copies + 1, spa_max_replication(spa));
|
|
|
|
if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
|
|
|
|
gbh_copies = SPA_DVAS_PER_BP - 1;
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
|
2016-10-14 03:59:18 +03:00
|
|
|
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
|
|
|
|
|
|
|
|
flags |= METASLAB_ASYNC_ALLOC;
|
|
|
|
VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The logical zio has already placed a reservation for
|
|
|
|
* 'copies' allocation slots but gang blocks may require
|
|
|
|
* additional copies. These additional copies
|
|
|
|
* (i.e. gbh_copies - copies) are guaranteed to succeed
|
|
|
|
* since metaslab_class_throttle_reserve() always allows
|
|
|
|
* additional reservations for gang blocks.
|
|
|
|
*/
|
|
|
|
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
|
|
|
|
pio, flags));
|
|
|
|
}
|
|
|
|
|
|
|
|
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
|
2017-01-12 22:52:56 +03:00
|
|
|
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
|
|
|
|
&pio->io_alloc_list, pio);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
2016-10-14 03:59:18 +03:00
|
|
|
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we failed to allocate the gang block header then
|
|
|
|
* we remove any additional allocation reservations that
|
|
|
|
* we placed here. The original reservation will
|
|
|
|
* be removed when the logical I/O goes to the ready
|
|
|
|
* stage.
|
|
|
|
*/
|
|
|
|
metaslab_class_throttle_unreserve(mc,
|
|
|
|
gbh_copies - copies, pio);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_error = error;
|
2008-11-20 23:01:55 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (pio == gio) {
|
|
|
|
gnpp = &gio->io_gang_tree;
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
gnpp = pio->io_private;
|
|
|
|
ASSERT(pio->io_ready == zio_write_gang_member_ready);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
gn = zio_gang_node_alloc(gnpp);
|
|
|
|
gbh = gn->gn_gbh;
|
|
|
|
bzero(gbh, SPA_GANGBLOCKSIZE);
|
2016-07-22 18:52:49 +03:00
|
|
|
gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Create the gang header.
|
|
|
|
*/
|
2016-07-22 18:52:49 +03:00
|
|
|
zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
|
|
|
|
zio_write_gang_done, NULL, pio->io_priority,
|
|
|
|
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Create and nowait the gang children.
|
|
|
|
*/
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; resid != 0; resid -= lsize, g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
|
|
|
|
SPA_MINBLOCKSIZE);
|
|
|
|
ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zp.zp_checksum = gio->io_prop.zp_checksum;
|
2008-12-03 23:09:06 +03:00
|
|
|
zp.zp_compress = ZIO_COMPRESS_OFF;
|
|
|
|
zp.zp_type = DMU_OT_NONE;
|
|
|
|
zp.zp_level = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
zp.zp_copies = gio->io_prop.zp_copies;
|
2013-05-10 23:47:54 +04:00
|
|
|
zp.zp_dedup = B_FALSE;
|
|
|
|
zp.zp_dedup_verify = B_FALSE;
|
|
|
|
zp.zp_nopwrite = B_FALSE;
|
2017-09-12 23:15:11 +03:00
|
|
|
zp.zp_encrypt = gio->io_prop.zp_encrypt;
|
|
|
|
zp.zp_byteorder = gio->io_prop.zp_byteorder;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
|
|
|
|
bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
|
|
|
|
bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
|
|
|
|
lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
|
|
|
|
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
|
2016-10-14 03:59:18 +03:00
|
|
|
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
|
|
|
|
|
|
|
|
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Gang children won't throttle but we should
|
|
|
|
* account for their work, so reserve an allocation
|
|
|
|
* slot for them here.
|
|
|
|
*/
|
|
|
|
VERIFY(metaslab_class_throttle_reserve(mc,
|
|
|
|
zp.zp_copies, cio, flags));
|
|
|
|
}
|
|
|
|
zio_nowait(cio);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Set pio's pipeline to just wait for zio to finish.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
/*
|
|
|
|
* We didn't allocate this bp, so make sure it doesn't get unmarked.
|
|
|
|
*/
|
|
|
|
pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_nowait(zio);
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
2016-06-16 01:47:05 +03:00
|
|
|
* The zio_nop_write stage in the pipeline determines if allocating a
|
|
|
|
* new bp is necessary. The nopwrite feature can handle writes in
|
|
|
|
* either syncing or open context (i.e. zil writes) and as a result is
|
|
|
|
* mutually exclusive with dedup.
|
|
|
|
*
|
|
|
|
* By leveraging a cryptographically secure checksum, such as SHA256, we
|
|
|
|
* can compare the checksums of the new data and the old to determine if
|
|
|
|
* allocating a new block is required. Note that our requirements for
|
|
|
|
* cryptographic strength are fairly weak: there can't be any accidental
|
|
|
|
* hash collisions, but we don't need to be secure against intentional
|
|
|
|
* (malicious) collisions. To trigger a nopwrite, you have to be able
|
|
|
|
* to write the file to begin with, and triggering an incorrect (hash
|
|
|
|
* collision) nopwrite is no worse than simply writing to the file.
|
|
|
|
* That said, there are no known attacks against the checksum algorithms
|
|
|
|
* used for nopwrite, assuming that the salt and the checksums
|
|
|
|
* themselves remain secret.
|
2013-05-10 23:47:54 +04:00
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zio_nop_write(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_LEVEL(bp) == 0);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
|
|
|
ASSERT(zp->zp_nopwrite);
|
|
|
|
ASSERT(!zp->zp_dedup);
|
|
|
|
ASSERT(zio->io_bp_override == NULL);
|
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check to see if the original bp and the new bp have matching
|
|
|
|
* characteristics (i.e. same checksum, compression algorithms, etc).
|
|
|
|
* If they don't then just continue with the pipeline which will
|
|
|
|
* allocate a new bp.
|
|
|
|
*/
|
|
|
|
if (BP_IS_HOLE(bp_orig) ||
|
2016-06-16 01:47:05 +03:00
|
|
|
!(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
|
|
|
|
ZCHECKSUM_FLAG_NOPWRITE) ||
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
|
2013-05-10 23:47:54 +04:00
|
|
|
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
|
|
|
|
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
|
|
|
|
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
|
|
|
|
zp->zp_copies != BP_GET_NDVAS(bp_orig))
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the checksums match then reset the pipeline so that we
|
|
|
|
* avoid allocating a new bp and issuing any I/O.
|
|
|
|
*/
|
|
|
|
if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
|
2016-06-16 01:47:05 +03:00
|
|
|
ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
|
|
|
|
ZCHECKSUM_FLAG_NOPWRITE);
|
2013-05-10 23:47:54 +04:00
|
|
|
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
|
|
|
|
ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
|
|
|
|
ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
|
|
|
|
ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
|
|
|
|
sizeof (uint64_t)) == 0);
|
|
|
|
|
|
|
|
*bp = *bp_orig;
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
zio->io_flags |= ZIO_FLAG_NOPWRITE;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2010-05-29 00:45:14 +04:00
|
|
|
* Dedup
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
zio_ddt_child_read_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp;
|
|
|
|
zio_t *pio = zio_unique_parent(zio);
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
ddp = ddt_phys_select(dde, bp);
|
|
|
|
if (zio->io_error == 0)
|
|
|
|
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
|
2016-07-22 18:52:49 +03:00
|
|
|
|
|
|
|
if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
|
|
|
|
dde->dde_repair_abd = zio->io_abd;
|
2010-05-29 00:45:14 +04:00
|
|
|
else
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_free(zio->io_abd);
|
2010-05-29 00:45:14 +04:00
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_ddt_read_start(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
if (zio->io_child_error[ZIO_CHILD_DDT]) {
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
|
|
|
ddt_entry_t *dde = ddt_repair_start(ddt, bp);
|
|
|
|
ddt_phys_t *ddp = dde->dde_phys;
|
|
|
|
ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
|
|
|
|
blkptr_t blk;
|
|
|
|
|
|
|
|
ASSERT(zio->io_vsd == NULL);
|
|
|
|
zio->io_vsd = dde;
|
|
|
|
|
|
|
|
if (ddp_self == NULL)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
|
|
|
|
continue;
|
|
|
|
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
|
|
|
|
&blk);
|
|
|
|
zio_nowait(zio_read(zio, zio->io_spa, &blk,
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_alloc_for_io(zio->io_size, B_TRUE),
|
|
|
|
zio->io_size, zio_ddt_child_read_done, dde,
|
|
|
|
zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
|
|
|
|
ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
zio_nowait(zio_read(zio, zio->io_spa, bp,
|
2016-07-22 18:52:49 +03:00
|
|
|
zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_ddt_read_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
if (zio->io_child_error[ZIO_CHILD_DDT]) {
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
|
|
|
ddt_entry_t *dde = zio->io_vsd;
|
|
|
|
if (ddt == NULL) {
|
|
|
|
ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
if (dde == NULL) {
|
|
|
|
zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
|
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
|
|
|
}
|
2016-07-22 18:52:49 +03:00
|
|
|
if (dde->dde_repair_abd != NULL) {
|
|
|
|
abd_copy(zio->io_abd, dde->dde_repair_abd,
|
|
|
|
zio->io_size);
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_child_error[ZIO_CHILD_DDT] = 0;
|
|
|
|
}
|
|
|
|
ddt_repair_done(ddt, dde);
|
|
|
|
zio->io_vsd = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(zio->io_vsd == NULL);
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static boolean_t
|
|
|
|
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
2016-09-13 04:34:19 +03:00
|
|
|
boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2016-09-13 04:34:19 +03:00
|
|
|
ASSERT(!(zio->io_bp_override && do_raw));
|
2016-07-11 20:45:52 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Note: we compare the original data, not the transformed data,
|
|
|
|
* because when zio->io_bp is an override bp, we will not have
|
|
|
|
* pushed the I/O transforms. That's an important optimization
|
|
|
|
* because otherwise we'd compress/encrypt all dmu_sync() data twice.
|
2016-09-13 04:34:19 +03:00
|
|
|
* However, we should never get a raw, override zio so in these
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
* cases we can compare the io_abd directly. This is useful because
|
2016-09-13 04:34:19 +03:00
|
|
|
* it allows us to do dedup verification even if we don't have access
|
|
|
|
* to the original data (for instance, if the encryption keys aren't
|
|
|
|
* loaded).
|
2010-05-29 00:45:14 +04:00
|
|
|
*/
|
2016-09-13 04:34:19 +03:00
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_t *lio = dde->dde_lead_zio[p];
|
|
|
|
|
2016-09-13 04:34:19 +03:00
|
|
|
if (lio != NULL && do_raw) {
|
|
|
|
return (lio->io_size != zio->io_size ||
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_cmp(zio->io_abd, lio->io_abd) != 0);
|
2016-09-13 04:34:19 +03:00
|
|
|
} else if (lio != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
return (lio->io_orig_size != zio->io_orig_size ||
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
|
2016-09-13 04:34:19 +03:00
|
|
|
if (ddp->ddp_phys_birth != 0 && do_raw) {
|
|
|
|
blkptr_t blk = *zio->io_bp;
|
|
|
|
uint64_t psize;
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *tmpabd;
|
2016-09-13 04:34:19 +03:00
|
|
|
int error;
|
|
|
|
|
|
|
|
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
|
|
|
|
psize = BP_GET_PSIZE(&blk);
|
|
|
|
|
|
|
|
if (psize != zio->io_size)
|
|
|
|
return (B_TRUE);
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
tmpabd = abd_alloc_for_io(psize, B_TRUE);
|
2016-09-13 04:34:19 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
|
2016-09-13 04:34:19 +03:00
|
|
|
psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
|
|
|
|
ZIO_FLAG_RAW, &zio->io_bookmark));
|
|
|
|
|
|
|
|
if (error == 0) {
|
2016-07-22 18:52:49 +03:00
|
|
|
if (abd_cmp(tmpabd, zio->io_abd) != 0)
|
2016-09-13 04:34:19 +03:00
|
|
|
error = SET_ERROR(ENOENT);
|
|
|
|
}
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_free(tmpabd);
|
2016-09-13 04:34:19 +03:00
|
|
|
ddt_enter(ddt);
|
|
|
|
return (error != 0);
|
|
|
|
} else if (ddp->ddp_phys_birth != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
arc_buf_t *abuf = NULL;
|
2014-12-06 20:24:32 +03:00
|
|
|
arc_flags_t aflags = ARC_FLAG_WAIT;
|
2010-05-29 00:45:14 +04:00
|
|
|
blkptr_t blk = *zio->io_bp;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
|
|
|
|
|
2016-09-13 04:34:19 +03:00
|
|
|
if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
|
|
|
|
return (B_TRUE);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_exit(ddt);
|
|
|
|
|
2013-07-03 00:26:24 +04:00
|
|
|
error = arc_read(NULL, spa, &blk,
|
2010-05-29 00:45:14 +04:00
|
|
|
arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
|
|
|
|
&aflags, &zio->io_bookmark);
|
|
|
|
|
|
|
|
if (error == 0) {
|
2016-07-22 18:52:49 +03:00
|
|
|
if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_orig_size) != 0)
|
2016-09-13 04:34:19 +03:00
|
|
|
error = SET_ERROR(ENOENT);
|
2016-06-02 07:04:53 +03:00
|
|
|
arc_buf_destroy(abuf, &abuf);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
return (error != 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zio_ddt_child_write_ready(zio_t *zio)
|
|
|
|
{
|
|
|
|
int p = zio->io_prop.zp_copies;
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
zio_t *pio;
|
|
|
|
|
|
|
|
if (zio->io_error)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
|
|
|
|
ASSERT(dde->dde_lead_zio[p] == zio);
|
|
|
|
|
|
|
|
ddt_phys_fill(ddp, zio->io_bp);
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
zio_link_t *zl = NULL;
|
2016-10-14 03:59:18 +03:00
|
|
|
while ((pio = zio_walk_parents(zio, &zl)) != NULL)
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zio_ddt_child_write_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
int p = zio->io_prop.zp_copies;
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
|
|
|
|
ASSERT(ddp->ddp_refcnt == 0);
|
|
|
|
ASSERT(dde->dde_lead_zio[p] == zio);
|
|
|
|
dde->dde_lead_zio[p] = NULL;
|
|
|
|
|
|
|
|
if (zio->io_error == 0) {
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_link_t *zl = NULL;
|
|
|
|
while (zio_walk_parents(zio, &zl) != NULL)
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_phys_addref(ddp);
|
|
|
|
} else {
|
|
|
|
ddt_phys_clear(ddp);
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zio_ddt_ditto_write_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
int p = DDT_PHYS_DITTO;
|
2017-11-04 23:25:13 +03:00
|
|
|
ASSERTV(zio_prop_t *zp = &zio->io_prop);
|
2010-05-29 00:45:14 +04:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
ddt_t *ddt = ddt_select(zio->io_spa, bp);
|
|
|
|
ddt_entry_t *dde = zio->io_private;
|
|
|
|
ddt_phys_t *ddp = &dde->dde_phys[p];
|
|
|
|
ddt_key_t *ddk = &dde->dde_key;
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
|
|
|
|
ASSERT(ddp->ddp_refcnt == 0);
|
|
|
|
ASSERT(dde->dde_lead_zio[p] == zio);
|
|
|
|
dde->dde_lead_zio[p] = NULL;
|
|
|
|
|
|
|
|
if (zio->io_error == 0) {
|
|
|
|
ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
|
|
|
|
ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
|
|
|
|
ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
|
|
|
|
if (ddp->ddp_phys_birth != 0)
|
|
|
|
ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
|
|
|
|
ddt_phys_fill(ddp, bp);
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_ddt_write(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
uint64_t txg = zio->io_txg;
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
int p = zp->zp_copies;
|
|
|
|
int ditto_copies;
|
|
|
|
zio_t *cio = NULL;
|
|
|
|
zio_t *dio = NULL;
|
|
|
|
ddt_t *ddt = ddt_select(spa, bp);
|
|
|
|
ddt_entry_t *dde;
|
|
|
|
ddt_phys_t *ddp;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
|
|
|
|
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
|
2016-09-13 04:34:19 +03:00
|
|
|
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
dde = ddt_lookup(ddt, bp, B_TRUE);
|
|
|
|
ddp = &dde->dde_phys[p];
|
|
|
|
|
|
|
|
if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
|
|
|
|
/*
|
|
|
|
* If we're using a weak checksum, upgrade to a strong checksum
|
|
|
|
* and try again. If we're already using a strong checksum,
|
|
|
|
* we can't resolve it, so just convert to an ordinary write.
|
|
|
|
* (And automatically e-mail a paper to Nature?)
|
|
|
|
*/
|
2016-06-16 01:47:05 +03:00
|
|
|
if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
|
|
|
|
ZCHECKSUM_FLAG_DEDUP)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zp->zp_checksum = spa_dedup_checksum(spa);
|
|
|
|
zio_pop_transforms(zio);
|
|
|
|
zio->io_stage = ZIO_STAGE_OPEN;
|
|
|
|
BP_ZERO(bp);
|
|
|
|
} else {
|
2013-05-10 23:47:54 +04:00
|
|
|
zp->zp_dedup = B_FALSE;
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
|
|
|
ddt_exit(ddt);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
|
|
|
|
ASSERT(ditto_copies < SPA_DVAS_PER_BP);
|
|
|
|
|
|
|
|
if (ditto_copies > ddt_ditto_copies_present(dde) &&
|
|
|
|
dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
|
|
|
|
zio_prop_t czp = *zp;
|
|
|
|
|
|
|
|
czp.zp_copies = ditto_copies;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we arrived here with an override bp, we won't have run
|
|
|
|
* the transform stack, so we won't have the data we need to
|
|
|
|
* generate a child i/o. So, toss the override bp and restart.
|
|
|
|
* This is safe, because using the override bp is just an
|
|
|
|
* optimization; and it's rare, so the cost doesn't matter.
|
|
|
|
*/
|
|
|
|
if (zio->io_bp_override) {
|
|
|
|
zio_pop_transforms(zio);
|
|
|
|
zio->io_stage = ZIO_STAGE_OPEN;
|
|
|
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
|
|
|
zio->io_bp_override = NULL;
|
|
|
|
BP_ZERO(bp);
|
|
|
|
ddt_exit(ddt);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
2016-07-11 20:45:52 +03:00
|
|
|
zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
|
2016-05-15 18:02:28 +03:00
|
|
|
NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
|
|
|
|
if (ddp->ddp_phys_birth != 0)
|
|
|
|
ddt_bp_fill(ddp, bp, txg);
|
|
|
|
if (dde->dde_lead_zio[p] != NULL)
|
|
|
|
zio_add_child(zio, dde->dde_lead_zio[p]);
|
|
|
|
else
|
|
|
|
ddt_phys_addref(ddp);
|
|
|
|
} else if (zio->io_bp_override) {
|
|
|
|
ASSERT(bp->blk_birth == txg);
|
|
|
|
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
|
|
|
|
ddt_phys_fill(ddp, bp);
|
|
|
|
ddt_phys_addref(ddp);
|
|
|
|
} else {
|
2016-07-22 18:52:49 +03:00
|
|
|
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
2016-07-11 20:45:52 +03:00
|
|
|
zio->io_orig_size, zio->io_orig_size, zp,
|
2016-05-15 18:02:28 +03:00
|
|
|
zio_ddt_child_write_ready, NULL, NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_child_write_done, dde, zio->io_priority,
|
|
|
|
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
dde->dde_lead_zio[p] = cio;
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_exit(ddt);
|
|
|
|
|
|
|
|
if (cio)
|
|
|
|
zio_nowait(cio);
|
|
|
|
if (dio)
|
|
|
|
zio_nowait(dio);
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_entry_t *freedde; /* for debugging */
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static int
|
|
|
|
zio_ddt_free(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
ddt_t *ddt = ddt_select(spa, bp);
|
|
|
|
ddt_entry_t *dde;
|
|
|
|
ddt_phys_t *ddp;
|
|
|
|
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
|
|
|
|
|
|
|
ddt_enter(ddt);
|
|
|
|
freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
|
2013-03-19 23:05:08 +04:00
|
|
|
if (dde) {
|
|
|
|
ddp = ddt_phys_select(dde, bp);
|
|
|
|
if (ddp)
|
|
|
|
ddt_phys_decref(ddp);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_exit(ddt);
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Allocate and free blocks
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
static zio_t *
|
|
|
|
zio_io_to_allocate(spa_t *spa)
|
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
|
|
|
|
|
|
|
|
zio = avl_first(&spa->spa_alloc_tree);
|
|
|
|
if (zio == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to place a reservation for this zio. If we're unable to
|
|
|
|
* reserve then we throttle.
|
|
|
|
*/
|
|
|
|
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
|
|
|
|
zio->io_prop.zp_copies, zio, 0)) {
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
avl_remove(&spa->spa_alloc_tree, zio);
|
|
|
|
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
|
|
|
|
|
|
|
|
return (zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_dva_throttle(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
zio_t *nio;
|
|
|
|
|
|
|
|
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
|
|
|
|
!spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
|
|
|
|
zio->io_child_type == ZIO_CHILD_GANG ||
|
|
|
|
zio->io_flags & ZIO_FLAG_NODATA) {
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
|
|
|
|
|
|
|
ASSERT3U(zio->io_queued_timestamp, >, 0);
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
|
|
|
|
|
|
|
|
mutex_enter(&spa->spa_alloc_lock);
|
|
|
|
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
avl_add(&spa->spa_alloc_tree, zio);
|
|
|
|
|
|
|
|
nio = zio_io_to_allocate(zio->io_spa);
|
|
|
|
mutex_exit(&spa->spa_alloc_lock);
|
|
|
|
|
|
|
|
if (nio == zio)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
|
|
|
if (nio != NULL) {
|
|
|
|
ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
|
|
|
|
/*
|
|
|
|
* We are passing control to a new zio so make sure that
|
|
|
|
* it is processed by a different thread. We do this to
|
|
|
|
* avoid stack overflows that can occur when parents are
|
|
|
|
* throttled and children are making progress. We allow
|
|
|
|
* it to go to the head of the taskq since it's already
|
|
|
|
* been waiting.
|
|
|
|
*/
|
|
|
|
zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
|
|
|
|
}
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_allocate_dispatch(spa_t *spa)
|
|
|
|
{
|
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
mutex_enter(&spa->spa_alloc_lock);
|
|
|
|
zio = zio_io_to_allocate(spa);
|
|
|
|
mutex_exit(&spa->spa_alloc_lock);
|
|
|
|
if (zio == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
|
|
|
|
ASSERT0(zio->io_error);
|
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
|
|
|
zio_dva_allocate(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
2010-05-29 00:45:14 +04:00
|
|
|
metaslab_class_t *mc = spa_normal_class(spa);
|
2008-11-20 23:01:55 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
int error;
|
2011-07-26 23:08:52 +04:00
|
|
|
int flags = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_gang_leader == NULL) {
|
|
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
|
|
|
zio->io_gang_leader = zio;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(BP_IS_HOLE(bp));
|
2013-05-11 01:17:03 +04:00
|
|
|
ASSERT0(BP_GET_NDVAS(bp));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT3U(zio->io_prop.zp_copies, >, 0);
|
|
|
|
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
|
|
|
|
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
|
2016-10-14 03:59:18 +03:00
|
|
|
if (zio->io_flags & ZIO_FLAG_NODATA)
|
|
|
|
flags |= METASLAB_DONT_THROTTLE;
|
|
|
|
if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
|
|
|
|
flags |= METASLAB_GANG_CHILD;
|
|
|
|
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
|
|
|
|
flags |= METASLAB_ASYNC_ALLOC;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
error = metaslab_alloc(spa, mc, zio->io_size, bp,
|
2017-01-12 22:52:56 +03:00
|
|
|
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
|
|
|
|
&zio->io_alloc_list, zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
if (error != 0) {
|
2017-01-04 02:18:33 +03:00
|
|
|
zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
|
2011-07-26 23:08:52 +04:00
|
|
|
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
|
|
|
|
error);
|
2008-12-03 23:09:06 +03:00
|
|
|
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
|
|
|
|
return (zio_write_gang_block(zio));
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_error = error;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_dva_free(zio_t *zio)
|
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_dva_claim(zio_t *zio)
|
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
int error;
|
|
|
|
|
|
|
|
error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
|
|
|
|
if (error)
|
|
|
|
zio->io_error = error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Undo an allocation. This is used by zio_done() when an I/O fails
|
|
|
|
* and we want to give back the block we just allocated.
|
|
|
|
* This handles both normal blocks and gang blocks.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
|
|
|
|
{
|
|
|
|
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_bp_override == NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (!BP_IS_HOLE(bp))
|
2010-05-29 00:45:14 +04:00
|
|
|
metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (gn != NULL) {
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_dva_unallocate(zio, gn->gn_child[g],
|
|
|
|
&gn->gn_gbh->zg_blkptr[g]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to allocate an intent log block. Return 0 on success, errno on failure.
|
|
|
|
*/
|
|
|
|
int
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
|
|
|
|
uint64_t size, boolean_t *slog)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
int error = 1;
|
2017-01-12 22:52:56 +03:00
|
|
|
zio_alloc_list_t io_alloc_list;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(txg > spa_syncing_txg(spa));
|
|
|
|
|
2017-01-12 22:52:56 +03:00
|
|
|
metaslab_trace_init(&io_alloc_list);
|
OpenZFS 7578 - Fix/improve some aspects of ZIL writing
- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.
- Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
- Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size. In some
cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas. This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently. Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.
- While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.
Sponsored by: iXsystems, Inc.
Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
2017-06-09 19:15:37 +03:00
|
|
|
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
|
|
|
|
txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL);
|
|
|
|
if (error == 0) {
|
|
|
|
*slog = TRUE;
|
|
|
|
} else {
|
2010-05-29 00:45:14 +04:00
|
|
|
error = metaslab_alloc(spa, spa_normal_class(spa), size,
|
2017-01-12 22:52:56 +03:00
|
|
|
new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
|
|
|
|
&io_alloc_list, NULL);
|
OpenZFS 7578 - Fix/improve some aspects of ZIL writing
- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.
- Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
- Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size. In some
cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas. This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently. Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.
- While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.
Sponsored by: iXsystems, Inc.
Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
2017-06-09 19:15:37 +03:00
|
|
|
if (error == 0)
|
|
|
|
*slog = FALSE;
|
2012-04-20 02:55:28 +04:00
|
|
|
}
|
2017-01-12 22:52:56 +03:00
|
|
|
metaslab_trace_fini(&io_alloc_list);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
BP_SET_LSIZE(new_bp, size);
|
|
|
|
BP_SET_PSIZE(new_bp, size);
|
|
|
|
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_CHECKSUM(new_bp,
|
|
|
|
spa_version(spa) >= SPA_VERSION_SLIM_ZIL
|
|
|
|
? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
|
|
|
|
BP_SET_LEVEL(new_bp, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
BP_SET_DEDUP(new_bp, 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* encrypted blocks will require an IV and salt. We generate
|
|
|
|
* these now since we will not be rewriting the bp at
|
|
|
|
* rewrite time.
|
|
|
|
*/
|
|
|
|
if (os->os_encrypted) {
|
|
|
|
uint8_t iv[ZIO_DATA_IV_LEN];
|
|
|
|
uint8_t salt[ZIO_DATA_SALT_LEN];
|
|
|
|
|
|
|
|
BP_SET_CRYPT(new_bp, B_TRUE);
|
|
|
|
VERIFY0(spa_crypt_get_salt(spa,
|
|
|
|
dmu_objset_id(os), salt));
|
|
|
|
VERIFY0(zio_crypt_generate_iv(iv));
|
|
|
|
|
|
|
|
zio_crypt_encode_params_bp(new_bp, salt, iv);
|
|
|
|
}
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
} else {
|
|
|
|
zfs_dbgmsg("%s: zil block allocation failure: "
|
|
|
|
"size %llu, error %d", spa_name(spa), size, error);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Free an intent log block.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
|
|
|
void
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(!BP_IS_GANG(bp));
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free(spa, txg, bp);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Read and write to physical devices
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2014-10-21 02:07:45 +04:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Issue an I/O to the underlying vdev. Typically the issue pipeline
|
|
|
|
* stops after this stage and will resume upon I/O completion.
|
|
|
|
* However, there are instances where the vdev layer may need to
|
|
|
|
* continue the pipeline when an I/O was not issued. Since the I/O
|
|
|
|
* that was sent to the vdev layer might be different than the one
|
|
|
|
* currently active in the pipeline (see vdev_queue_io()), we explicitly
|
|
|
|
* force the underlying vdev layers to call either zio_execute() or
|
|
|
|
* zio_interrupt() to ensure that the pipeline continues with the correct I/O.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
|
|
|
zio_vdev_io_start(zio_t *zio)
|
|
|
|
{
|
|
|
|
vdev_t *vd = zio->io_vd;
|
|
|
|
uint64_t align;
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
|
2016-02-29 21:05:23 +03:00
|
|
|
zio->io_delay = 0;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd == NULL) {
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
|
|
|
|
spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* The mirror_ops handle multiple DVAs in a single BP.
|
|
|
|
*/
|
2014-10-21 02:07:45 +04:00
|
|
|
vdev_mirror_ops.vdev_op_io_start(zio);
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT3P(zio->io_logical, !=, zio);
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE && zio->io_vd->vdev_removing) {
|
|
|
|
/*
|
|
|
|
* Note: the code can handle other kinds of writes,
|
|
|
|
* but we don't expect them.
|
|
|
|
*/
|
|
|
|
ASSERT(zio->io_flags &
|
|
|
|
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
|
OpenZFS 9290 - device removal reduces redundancy of mirrors
Mirrors are supposed to provide redundancy in the face of whole-disk
failure and silent damage (e.g. some data on disk is not right, but ZFS
hasn't detected the whole device as being broken). However, the current
device removal implementation bypasses some of the mirror's redundancy.
Note that in no case is incorrect data returned, but we might get a
checksum error when we should have been able to find the right data.
There are two underlying problems:
1. When we remove a mirror device, we only read one side of the mirror.
Since we can't verify the checksum, this side may be silently bad, but
the good data is on the other side of the mirror (which we didn't read).
This can cause the removal to "bake in" the busted data – all copies of
the data in the new location are the same, busted version, while we left
the good version behind.
The fix for this is to read and copy both sides of the mirror. If the
old and new vdevs are mirrors, we will read both sides of the old
mirror, and write each copy to the corresponding side of the new mirror.
(If the old and new vdevs have a different number of children, we will
do this as best as possible.) Even though we aren't verifying checksums,
this ensures that as long as there's a good copy of the data, we'll have
a good copy after the removal, even if there's silent damage to one side
of the mirror. If we're removing a mirror that has some silent damage,
we'll have exactly the same damage in the new location (assuming that
the new location is also a mirror).
2. When we read from an indirect vdev that points to a mirror vdev, we
only consider one copy of the data. This can lead to reduced effective
redundancy, because we might read a bad copy of the data from one side
of the mirror, and not retry the other, good side of the mirror.
Note that the problem is not with the removal process, but rather after
the removal has completed (having copied correct data to both sides of
the mirror), if one side of the new mirror is silently damaged, we
encounter the problem when reading the relocated data via the indirect
vdev. Also note that the problem doesn't occur when ZFS knows that one
side of the mirror is bad, e.g. when a disk entirely fails or is
offlined.
The impact is that reads (from indirect vdevs that point to mirrors) may
return a checksum error even though the good data exists on one side of
the mirror, and scrub doesn't repair all data on the mirror (if some of
it is pointed to via an indirect vdev).
The fix for this is complicated by "split blocks" - one logical block
may be split into two (or more) pieces with each piece moved to a
different new location. In this case we need to read all versions of
each split (one from each side of the mirror), and figure out which
combination of versions results in the correct checksum, and then repair
the incorrect versions.
This ensures that we supply the same redundancy whether you use device
removal or not. For example, if a mirror has small silent errors on all
of its children, we can still reconstruct the correct data, as long as
those errors are at sufficiently-separated offsets (specifically,
separated by the largest block size - default of 128KB, but up to 16MB).
Porting notes:
* A new indirect vdev check was moved from dsl_scan_needs_resilver_cb()
to dsl_scan_needs_resilver(), which was added to ZoL as part of the
sequential scrub work.
* Passed NULL for zfs_ereport_post_checksum()'s zbookmark_phys_t
parameter. The extra parameter is unique to ZoL.
* When posting indirect checksum errors the ABD can be passed directly,
zfs_ereport_post_checksum() is not yet ABD-aware in OpenZFS.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9290
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/591
Closes #6900
2018-02-13 22:37:56 +03:00
|
|
|
ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
}
|
2016-10-14 03:59:18 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
align = 1ULL << vd->vdev_top->vdev_ashift;
|
|
|
|
|
2014-09-23 03:42:03 +04:00
|
|
|
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
|
|
|
|
P2PHASE(zio->io_size, align) != 0) {
|
|
|
|
/* Transform logical writes to be a full physical block size. */
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t asize = P2ROUNDUP(zio->io_size, align);
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
|
2012-10-25 02:22:31 +04:00
|
|
|
ASSERT(vd == vd->vdev_top);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE) {
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_copy(abuf, zio->io_abd, zio->io_size);
|
|
|
|
abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_push_transform(zio, abuf, asize, asize, zio_subblock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2014-09-23 03:42:03 +04:00
|
|
|
/*
|
|
|
|
* If this is not a physical io, make sure that it is properly aligned
|
|
|
|
* before proceeding.
|
|
|
|
*/
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
|
|
|
|
ASSERT0(P2PHASE(zio->io_offset, align));
|
|
|
|
ASSERT0(P2PHASE(zio->io_size, align));
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* For physical writes, we allow 512b aligned writes and assume
|
|
|
|
* the device will perform a read-modify-write as necessary.
|
|
|
|
*/
|
|
|
|
ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
|
|
|
|
ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
|
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
|
2009-01-16 00:59:39 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is a repair I/O, and there's no self-healing involved --
|
|
|
|
* that is, we're just resilvering what we expect to resilver --
|
|
|
|
* then don't do the I/O unless zio's txg is actually in vd's DTL.
|
OpenZFS 9290 - device removal reduces redundancy of mirrors
Mirrors are supposed to provide redundancy in the face of whole-disk
failure and silent damage (e.g. some data on disk is not right, but ZFS
hasn't detected the whole device as being broken). However, the current
device removal implementation bypasses some of the mirror's redundancy.
Note that in no case is incorrect data returned, but we might get a
checksum error when we should have been able to find the right data.
There are two underlying problems:
1. When we remove a mirror device, we only read one side of the mirror.
Since we can't verify the checksum, this side may be silently bad, but
the good data is on the other side of the mirror (which we didn't read).
This can cause the removal to "bake in" the busted data – all copies of
the data in the new location are the same, busted version, while we left
the good version behind.
The fix for this is to read and copy both sides of the mirror. If the
old and new vdevs are mirrors, we will read both sides of the old
mirror, and write each copy to the corresponding side of the new mirror.
(If the old and new vdevs have a different number of children, we will
do this as best as possible.) Even though we aren't verifying checksums,
this ensures that as long as there's a good copy of the data, we'll have
a good copy after the removal, even if there's silent damage to one side
of the mirror. If we're removing a mirror that has some silent damage,
we'll have exactly the same damage in the new location (assuming that
the new location is also a mirror).
2. When we read from an indirect vdev that points to a mirror vdev, we
only consider one copy of the data. This can lead to reduced effective
redundancy, because we might read a bad copy of the data from one side
of the mirror, and not retry the other, good side of the mirror.
Note that the problem is not with the removal process, but rather after
the removal has completed (having copied correct data to both sides of
the mirror), if one side of the new mirror is silently damaged, we
encounter the problem when reading the relocated data via the indirect
vdev. Also note that the problem doesn't occur when ZFS knows that one
side of the mirror is bad, e.g. when a disk entirely fails or is
offlined.
The impact is that reads (from indirect vdevs that point to mirrors) may
return a checksum error even though the good data exists on one side of
the mirror, and scrub doesn't repair all data on the mirror (if some of
it is pointed to via an indirect vdev).
The fix for this is complicated by "split blocks" - one logical block
may be split into two (or more) pieces with each piece moved to a
different new location. In this case we need to read all versions of
each split (one from each side of the mirror), and figure out which
combination of versions results in the correct checksum, and then repair
the incorrect versions.
This ensures that we supply the same redundancy whether you use device
removal or not. For example, if a mirror has small silent errors on all
of its children, we can still reconstruct the correct data, as long as
those errors are at sufficiently-separated offsets (specifically,
separated by the largest block size - default of 128KB, but up to 16MB).
Porting notes:
* A new indirect vdev check was moved from dsl_scan_needs_resilver_cb()
to dsl_scan_needs_resilver(), which was added to ZoL as part of the
sequential scrub work.
* Passed NULL for zfs_ereport_post_checksum()'s zbookmark_phys_t
parameter. The extra parameter is unique to ZoL.
* When posting indirect checksum errors the ABD can be passed directly,
zfs_ereport_post_checksum() is not yet ABD-aware in OpenZFS.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9290
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/591
Closes #6900
2018-02-13 22:37:56 +03:00
|
|
|
* This prevents spurious resilvering.
|
|
|
|
*
|
|
|
|
* There are a few ways that we can end up creating these spurious
|
|
|
|
* resilver i/os:
|
|
|
|
*
|
|
|
|
* 1. A resilver i/o will be issued if any DVA in the BP has a
|
|
|
|
* dirty DTL. The mirror code will issue resilver writes to
|
|
|
|
* each DVA, including the one(s) that are not on vdevs with dirty
|
|
|
|
* DTLs.
|
|
|
|
*
|
|
|
|
* 2. With nested replication, which happens when we have a
|
|
|
|
* "replacing" or "spare" vdev that's a child of a mirror or raidz.
|
|
|
|
* For example, given mirror(replacing(A+B), C), it's likely that
|
|
|
|
* only A is out of date (it's the new device). In this case, we'll
|
|
|
|
* read from C, then use the data to resilver A+B -- but we don't
|
|
|
|
* actually want to resilver B, just A. The top-level mirror has no
|
|
|
|
* way to know this, so instead we just discard unnecessary repairs
|
|
|
|
* as we work our way down the vdev tree.
|
|
|
|
*
|
|
|
|
* 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
|
|
|
|
* The same logic applies to any form of nested replication: ditto
|
|
|
|
* + mirror, RAID-Z + replacing, etc.
|
|
|
|
*
|
|
|
|
* However, indirect vdevs point off to other vdevs which may have
|
|
|
|
* DTL's, so we never bypass them. The child i/os on concrete vdevs
|
|
|
|
* will be properly bypassed instead.
|
2009-01-16 00:59:39 +03:00
|
|
|
*/
|
|
|
|
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
|
|
|
|
zio->io_txg != 0 && /* not a delegated i/o */
|
OpenZFS 9290 - device removal reduces redundancy of mirrors
Mirrors are supposed to provide redundancy in the face of whole-disk
failure and silent damage (e.g. some data on disk is not right, but ZFS
hasn't detected the whole device as being broken). However, the current
device removal implementation bypasses some of the mirror's redundancy.
Note that in no case is incorrect data returned, but we might get a
checksum error when we should have been able to find the right data.
There are two underlying problems:
1. When we remove a mirror device, we only read one side of the mirror.
Since we can't verify the checksum, this side may be silently bad, but
the good data is on the other side of the mirror (which we didn't read).
This can cause the removal to "bake in" the busted data – all copies of
the data in the new location are the same, busted version, while we left
the good version behind.
The fix for this is to read and copy both sides of the mirror. If the
old and new vdevs are mirrors, we will read both sides of the old
mirror, and write each copy to the corresponding side of the new mirror.
(If the old and new vdevs have a different number of children, we will
do this as best as possible.) Even though we aren't verifying checksums,
this ensures that as long as there's a good copy of the data, we'll have
a good copy after the removal, even if there's silent damage to one side
of the mirror. If we're removing a mirror that has some silent damage,
we'll have exactly the same damage in the new location (assuming that
the new location is also a mirror).
2. When we read from an indirect vdev that points to a mirror vdev, we
only consider one copy of the data. This can lead to reduced effective
redundancy, because we might read a bad copy of the data from one side
of the mirror, and not retry the other, good side of the mirror.
Note that the problem is not with the removal process, but rather after
the removal has completed (having copied correct data to both sides of
the mirror), if one side of the new mirror is silently damaged, we
encounter the problem when reading the relocated data via the indirect
vdev. Also note that the problem doesn't occur when ZFS knows that one
side of the mirror is bad, e.g. when a disk entirely fails or is
offlined.
The impact is that reads (from indirect vdevs that point to mirrors) may
return a checksum error even though the good data exists on one side of
the mirror, and scrub doesn't repair all data on the mirror (if some of
it is pointed to via an indirect vdev).
The fix for this is complicated by "split blocks" - one logical block
may be split into two (or more) pieces with each piece moved to a
different new location. In this case we need to read all versions of
each split (one from each side of the mirror), and figure out which
combination of versions results in the correct checksum, and then repair
the incorrect versions.
This ensures that we supply the same redundancy whether you use device
removal or not. For example, if a mirror has small silent errors on all
of its children, we can still reconstruct the correct data, as long as
those errors are at sufficiently-separated offsets (specifically,
separated by the largest block size - default of 128KB, but up to 16MB).
Porting notes:
* A new indirect vdev check was moved from dsl_scan_needs_resilver_cb()
to dsl_scan_needs_resilver(), which was added to ZoL as part of the
sequential scrub work.
* Passed NULL for zfs_ereport_post_checksum()'s zbookmark_phys_t
parameter. The extra parameter is unique to ZoL.
* When posting indirect checksum errors the ABD can be passed directly,
zfs_ereport_post_checksum() is not yet ABD-aware in OpenZFS.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9290
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/591
Closes #6900
2018-02-13 22:37:56 +03:00
|
|
|
vd->vdev_ops != &vdev_indirect_ops &&
|
2009-01-16 00:59:39 +03:00
|
|
|
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
zio_vdev_io_bypass(zio);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
|
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
|
2009-02-18 23:51:31 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if ((zio = vdev_queue_io(zio)) == NULL)
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
|
|
|
|
|
|
|
if (!vdev_accessible(vd, zio)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_interrupt(zio);
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
|
|
|
}
|
2017-08-02 19:08:38 +03:00
|
|
|
zio->io_delay = gethrtime();
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2014-10-21 02:07:45 +04:00
|
|
|
vd->vdev_ops->vdev_op_io_start(zio);
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zio_vdev_io_done(zio_t *zio)
|
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
vdev_t *vd = zio->io_vd;
|
|
|
|
vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
|
|
|
|
boolean_t unexpected_error = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
|
2016-02-29 21:05:23 +03:00
|
|
|
if (zio->io_delay)
|
|
|
|
zio->io_delay = gethrtime() - zio->io_delay;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
|
|
|
|
|
|
|
|
vdev_queue_io_done(zio);
|
|
|
|
|
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE)
|
|
|
|
vdev_cache_write(zio);
|
|
|
|
|
|
|
|
if (zio_injection_enabled && zio->io_error == 0)
|
2017-08-15 01:17:15 +03:00
|
|
|
zio->io_error = zio_handle_device_injections(vd, zio,
|
|
|
|
EIO, EILSEQ);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (zio_injection_enabled && zio->io_error == 0)
|
|
|
|
zio->io_error = zio_handle_label_injection(zio, EIO);
|
|
|
|
|
|
|
|
if (zio->io_error) {
|
|
|
|
if (!vdev_accessible(vd, zio)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
unexpected_error = B_TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ops->vdev_op_io_done(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-07-18 21:43:55 +03:00
|
|
|
if (unexpected_error)
|
2009-02-18 23:51:31 +03:00
|
|
|
VERIFY(vdev_probe(vd, zio) == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-12-21 20:13:06 +03:00
|
|
|
/*
|
|
|
|
* This function is used to change the priority of an existing zio that is
|
|
|
|
* currently in-flight. This is used by the arc to upgrade priority in the
|
|
|
|
* event that a demand read is made for a block that is currently queued
|
|
|
|
* as a scrub or async read IO. Otherwise, the high priority read request
|
|
|
|
* would end up having to wait for the lower priority IO.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zio_change_priority(zio_t *pio, zio_priority_t priority)
|
|
|
|
{
|
|
|
|
zio_t *cio, *cio_next;
|
|
|
|
zio_link_t *zl = NULL;
|
|
|
|
|
|
|
|
ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
|
|
|
|
|
|
|
|
if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
|
|
|
|
vdev_queue_change_io_priority(pio, priority);
|
|
|
|
} else {
|
|
|
|
pio->io_priority = priority;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
|
|
|
|
cio_next = zio_walk_children(pio, &zl);
|
|
|
|
zio_change_priority(cio, priority);
|
|
|
|
}
|
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* For non-raidz ZIOs, we can just copy aside the bad data read from the
|
|
|
|
* disk, and use that to finish the checksum ereport later.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
|
2017-01-05 22:10:07 +03:00
|
|
|
const abd_t *good_buf)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
|
|
|
/* no processing needed */
|
|
|
|
zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
void
|
|
|
|
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
|
|
|
|
{
|
2017-01-05 22:10:07 +03:00
|
|
|
void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-01-05 22:10:07 +03:00
|
|
|
abd_copy(abd, zio->io_abd, zio->io_size);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
zcr->zcr_cbinfo = zio->io_size;
|
2017-01-05 22:10:07 +03:00
|
|
|
zcr->zcr_cbdata = abd;
|
2010-05-29 00:45:14 +04:00
|
|
|
zcr->zcr_finish = zio_vsd_default_cksum_finish;
|
2017-01-05 22:10:07 +03:00
|
|
|
zcr->zcr_free = zio_abd_free;
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
|
|
|
zio_vdev_io_assess(zio_t *zio)
|
|
|
|
{
|
|
|
|
vdev_t *vd = zio->io_vd;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
|
|
|
|
spa_config_exit(zio->io_spa, SCL_ZIO, zio);
|
|
|
|
|
|
|
|
if (zio->io_vsd != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_vsd_ops->vsd_free(zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_vsd = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio_injection_enabled && zio->io_error == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_error = zio_handle_fault_injection(zio, EIO);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the I/O failed, determine whether we should attempt to retry it.
|
2010-05-29 00:45:14 +04:00
|
|
|
*
|
|
|
|
* On retry, we cut in line in the issue queue, since we don't want
|
|
|
|
* compression/checksumming/etc. work to prevent our (cheap) IO reissue.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error && vd == NULL &&
|
|
|
|
!(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
|
2008-11-20 23:01:55 +03:00
|
|
|
zio->io_error = 0;
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_flags |= ZIO_FLAG_IO_RETRY |
|
|
|
|
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
|
|
|
|
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
|
|
|
|
zio_requeue_io_start_cut_in_line);
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* If we got an error on a leaf device, convert it to ENXIO
|
|
|
|
* if the device is not accessible at all.
|
|
|
|
*/
|
|
|
|
if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
!vdev_accessible(vd, zio))
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we can't write to an interior vdev (mirror or RAID-Z),
|
|
|
|
* set vdev_cant_write so that we stop trying to allocate from it.
|
|
|
|
*/
|
|
|
|
if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
|
2013-09-04 16:00:57 +04:00
|
|
|
vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
|
2008-12-03 23:09:06 +03:00
|
|
|
vd->vdev_cant_write = B_TRUE;
|
2013-09-04 16:00:57 +04:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2017-02-04 20:23:50 +03:00
|
|
|
/*
|
|
|
|
* If a cache flush returns ENOTSUP or ENOTTY, we know that no future
|
|
|
|
* attempts will ever succeed. In this case we set a persistent bit so
|
|
|
|
* that we don't bother with it in the future.
|
|
|
|
*/
|
|
|
|
if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
|
|
|
|
zio->io_type == ZIO_TYPE_IOCTL &&
|
|
|
|
zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
|
|
|
|
vd->vdev_nowritecache = B_TRUE;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error)
|
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
|
|
|
|
zio->io_physdone != NULL) {
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
|
|
|
|
zio->io_physdone(zio->io_logical);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_vdev_io_reissue(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
|
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage >>= 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_vdev_io_redone(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage >>= 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zio_vdev_io_bypass(zio_t *zio)
|
|
|
|
{
|
|
|
|
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
|
|
|
|
ASSERT(zio->io_error == 0);
|
|
|
|
|
|
|
|
zio->io_flags |= ZIO_FLAG_IO_BYPASS;
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Encrypt and store encryption parameters
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
|
|
|
|
* managing the storage of encryption parameters and passing them to the
|
|
|
|
* lower-level encryption functions.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zio_encrypt(zio_t *zio)
|
|
|
|
{
|
|
|
|
zio_prop_t *zp = &zio->io_prop;
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
uint64_t psize = BP_GET_PSIZE(bp);
|
2017-11-08 22:12:59 +03:00
|
|
|
uint64_t dsobj = zio->io_bookmark.zb_objset;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
dmu_object_type_t ot = BP_GET_TYPE(bp);
|
|
|
|
void *enc_buf = NULL;
|
|
|
|
abd_t *eabd = NULL;
|
|
|
|
uint8_t salt[ZIO_DATA_SALT_LEN];
|
|
|
|
uint8_t iv[ZIO_DATA_IV_LEN];
|
|
|
|
uint8_t mac[ZIO_DATA_MAC_LEN];
|
|
|
|
boolean_t no_crypt = B_FALSE;
|
|
|
|
|
|
|
|
/* the root zio already encrypted the data */
|
|
|
|
if (zio->io_child_type == ZIO_CHILD_GANG)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
|
|
|
/* only ZIL blocks are re-encrypted on rewrite */
|
|
|
|
if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
|
|
|
if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
|
|
|
|
BP_SET_CRYPT(bp, B_FALSE);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if we are doing raw encryption set the provided encryption params */
|
|
|
|
if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
|
2017-11-08 22:12:59 +03:00
|
|
|
ASSERT0(BP_GET_LEVEL(bp));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
|
|
|
BP_SET_BYTEORDER(bp, zp->zp_byteorder);
|
|
|
|
if (ot != DMU_OT_OBJSET)
|
|
|
|
zio_crypt_encode_mac_bp(bp, zp->zp_mac);
|
2017-11-08 22:12:59 +03:00
|
|
|
|
|
|
|
/* dnode blocks must be written out in the provided byteorder */
|
|
|
|
if (zp->zp_byteorder != ZFS_HOST_BYTEORDER &&
|
|
|
|
ot == DMU_OT_DNODE) {
|
|
|
|
void *bswap_buf = zio_buf_alloc(psize);
|
|
|
|
abd_t *babd = abd_get_from_buf(bswap_buf, psize);
|
|
|
|
|
|
|
|
ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
|
|
|
|
abd_copy_to_buf(bswap_buf, zio->io_abd, psize);
|
|
|
|
dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf,
|
|
|
|
psize);
|
|
|
|
|
|
|
|
abd_take_ownership_of_buf(babd, B_TRUE);
|
|
|
|
zio_push_transform(zio, babd, psize, psize, NULL);
|
|
|
|
}
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (DMU_OT_IS_ENCRYPTED(ot))
|
|
|
|
zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* indirect blocks only maintain a cksum of the lower level MACs */
|
|
|
|
if (BP_GET_LEVEL(bp) > 0) {
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
|
|
|
VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
|
|
|
|
zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
|
|
|
|
mac));
|
|
|
|
zio_crypt_encode_mac_bp(bp, mac);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Objset blocks are a special case since they have 2 256-bit MACs
|
|
|
|
* embedded within them.
|
|
|
|
*/
|
|
|
|
if (ot == DMU_OT_OBJSET) {
|
|
|
|
ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
|
|
|
|
ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
2017-11-08 22:12:59 +03:00
|
|
|
VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj,
|
|
|
|
zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp)));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* unencrypted object types are only authenticated with a MAC */
|
|
|
|
if (!DMU_OT_IS_ENCRYPTED(ot)) {
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
2017-11-08 22:12:59 +03:00
|
|
|
VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj,
|
|
|
|
zio->io_abd, psize, mac));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio_crypt_encode_mac_bp(bp, mac);
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Later passes of sync-to-convergence may decide to rewrite data
|
|
|
|
* in place to avoid more disk reallocations. This presents a problem
|
|
|
|
* for encryption because this consitutes rewriting the new data with
|
|
|
|
* the same encryption key and IV. However, this only applies to blocks
|
|
|
|
* in the MOS (particularly the spacemaps) and we do not encrypt the
|
|
|
|
* MOS. We assert that the zio is allocating or an intent log write
|
|
|
|
* to enforce this.
|
|
|
|
*/
|
|
|
|
ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
|
|
|
|
ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
|
|
|
|
ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
|
|
|
|
ASSERT3U(psize, !=, 0);
|
|
|
|
|
|
|
|
enc_buf = zio_buf_alloc(psize);
|
|
|
|
eabd = abd_get_from_buf(enc_buf, psize);
|
|
|
|
abd_take_ownership_of_buf(eabd, B_TRUE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For an explanation of what encryption parameters are stored
|
|
|
|
* where, see the block comment in zio_crypt.c.
|
|
|
|
*/
|
|
|
|
if (ot == DMU_OT_INTENT_LOG) {
|
|
|
|
zio_crypt_decode_params_bp(bp, salt, iv);
|
|
|
|
} else {
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Perform the encryption. This should not fail */
|
2017-11-08 22:12:59 +03:00
|
|
|
VERIFY0(spa_do_crypt_abd(B_TRUE, spa, dsobj, bp, zio->io_txg,
|
|
|
|
psize, zio->io_abd, eabd, iv, mac, salt, &no_crypt));
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
|
|
|
/* encode encryption metadata into the bp */
|
|
|
|
if (ot == DMU_OT_INTENT_LOG) {
|
|
|
|
/*
|
|
|
|
* ZIL blocks store the MAC in the embedded checksum, so the
|
|
|
|
* transform must always be applied.
|
|
|
|
*/
|
|
|
|
zio_crypt_encode_mac_zil(enc_buf, mac);
|
|
|
|
zio_push_transform(zio, eabd, psize, psize, NULL);
|
|
|
|
} else {
|
|
|
|
BP_SET_CRYPT(bp, B_TRUE);
|
|
|
|
zio_crypt_encode_params_bp(bp, salt, iv);
|
|
|
|
zio_crypt_encode_mac_bp(bp, mac);
|
|
|
|
|
|
|
|
if (no_crypt) {
|
|
|
|
ASSERT3U(ot, ==, DMU_OT_DNODE);
|
|
|
|
abd_free(eabd);
|
|
|
|
} else {
|
|
|
|
zio_push_transform(zio, eabd, psize, psize, NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ==========================================================================
|
|
|
|
* Generate and verify checksums
|
|
|
|
* ==========================================================================
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zio_checksum_generate(zio_t *zio)
|
|
|
|
{
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-12-03 23:09:06 +03:00
|
|
|
enum zio_checksum checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (bp == NULL) {
|
|
|
|
/*
|
|
|
|
* This is zio_write_phys().
|
|
|
|
* We're either generating a label checksum, or none at all.
|
|
|
|
*/
|
|
|
|
checksum = zio->io_prop.zp_checksum;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (checksum == ZIO_CHECKSUM_OFF)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
|
|
|
|
ASSERT(checksum == ZIO_CHECKSUM_LABEL);
|
|
|
|
} else {
|
|
|
|
if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
|
|
|
|
ASSERT(!IO_IS_ALLOCATING(zio));
|
|
|
|
checksum = ZIO_CHECKSUM_GANG_HEADER;
|
|
|
|
} else {
|
|
|
|
checksum = BP_GET_CHECKSUM(bp);
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_checksum_verify(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_bad_cksum_t info;
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
int error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_vd != NULL);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (bp == NULL) {
|
|
|
|
/*
|
|
|
|
* This is zio_read_phys().
|
|
|
|
* We're either verifying a label checksum, or nothing at all.
|
|
|
|
*/
|
|
|
|
if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((error = zio_checksum_error(zio, &info)) != 0) {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_error = error;
|
2014-11-23 06:24:52 +03:00
|
|
|
if (error == ECKSUM &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_ereport_start_checksum(zio->io_spa,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio->io_vd, &zio->io_bookmark, zio,
|
|
|
|
zio->io_offset, zio->io_size, NULL, &info);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called by RAID-Z to ensure we don't compute the checksum twice.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zio_checksum_verified(zio_t *zio)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
* Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
|
2014-06-06 01:19:08 +04:00
|
|
|
* An error of 0 indicates success. ENXIO indicates whole-device failure,
|
2008-12-03 23:09:06 +03:00
|
|
|
* which may be transient (e.g. unplugged) or permament. ECKSUM and EIO
|
|
|
|
* indicate errors that are specific to one I/O, and most likely permanent.
|
|
|
|
* Any other error is presumed to be worse because we weren't expecting it.
|
|
|
|
* ==========================================================================
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
int
|
|
|
|
zio_worst_error(int e1, int e2)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
|
|
|
|
int r1, r2;
|
|
|
|
|
|
|
|
for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
|
|
|
|
if (e1 == zio_error_rank[r1])
|
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
|
|
|
|
if (e2 == zio_error_rank[r2])
|
|
|
|
break;
|
|
|
|
|
|
|
|
return (r1 > r2 ? e1 : e2);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ==========================================================================
|
2008-12-03 23:09:06 +03:00
|
|
|
* I/O completion
|
2008-11-20 23:01:55 +03:00
|
|
|
* ==========================================================================
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
static int
|
|
|
|
zio_ready(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *pio, *pio_next;
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_link_t *zl = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
|
|
|
|
ZIO_WAIT_READY)) {
|
2009-07-03 02:44:48 +04:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (zio->io_ready) {
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
2013-05-10 23:47:54 +04:00
|
|
|
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
|
|
|
|
(zio->io_flags & ZIO_FLAG_NOPWRITE));
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_ready(zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (bp != NULL && bp != &zio->io_bp_copy)
|
|
|
|
zio->io_bp_copy = *bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
if (zio->io_error != 0) {
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
|
|
|
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
/*
|
|
|
|
* We were unable to allocate anything, unreserve and
|
|
|
|
* issue the next I/O to allocate.
|
|
|
|
*/
|
|
|
|
metaslab_class_throttle_unreserve(
|
|
|
|
spa_normal_class(zio->io_spa),
|
|
|
|
zio->io_prop.zp_copies, zio);
|
|
|
|
zio_allocate_dispatch(zio->io_spa);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_state[ZIO_WAIT_READY] = 1;
|
2016-10-14 03:59:18 +03:00
|
|
|
pio = zio_walk_parents(zio, &zl);
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* As we notify zio's parents, new parents could be added.
|
|
|
|
* New parents go to the head of zio's io_parent_list, however,
|
|
|
|
* so we will (correctly) not notify them. The remainder of zio's
|
|
|
|
* io_parent_list, from 'pio_next' onward, cannot change because
|
|
|
|
* all parents must wait for us to be done before they can be done.
|
|
|
|
*/
|
|
|
|
for (; pio != NULL; pio = pio_next) {
|
2016-10-14 03:59:18 +03:00
|
|
|
pio_next = zio_walk_parents(zio, &zl);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
|
2009-02-18 23:51:31 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zio->io_flags & ZIO_FLAG_NODATA) {
|
|
|
|
if (BP_IS_GANG(bp)) {
|
|
|
|
zio->io_flags &= ~ZIO_FLAG_NODATA;
|
|
|
|
} else {
|
2016-07-22 18:52:49 +03:00
|
|
|
ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zio_injection_enabled &&
|
|
|
|
zio->io_spa->spa_syncing_txg == zio->io_txg)
|
|
|
|
zio_handle_ignored_writes(zio);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_CONTINUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* Update the allocation throttle accounting.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zio_dva_throttle_done(zio_t *zio)
|
|
|
|
{
|
2017-11-04 23:25:13 +03:00
|
|
|
ASSERTV(zio_t *lio = zio->io_logical);
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_t *pio = zio_unique_parent(zio);
|
|
|
|
vdev_t *vd = zio->io_vd;
|
|
|
|
int flags = METASLAB_ASYNC_ALLOC;
|
|
|
|
|
|
|
|
ASSERT3P(zio->io_bp, !=, NULL);
|
|
|
|
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
|
|
|
|
ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
|
|
|
|
ASSERT(vd != NULL);
|
|
|
|
ASSERT3P(vd, ==, vd->vdev_top);
|
2017-08-11 01:53:40 +03:00
|
|
|
ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
|
2016-10-14 03:59:18 +03:00
|
|
|
ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
|
|
|
|
ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
|
|
|
|
ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parents of gang children can have two flavors -- ones that
|
|
|
|
* allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
|
|
|
|
* and ones that allocated the constituent blocks. The allocation
|
|
|
|
* throttle needs to know the allocating parent zio so we must find
|
|
|
|
* it here.
|
|
|
|
*/
|
|
|
|
if (pio->io_child_type == ZIO_CHILD_GANG) {
|
|
|
|
/*
|
|
|
|
* If our parent is a rewrite gang child then our grandparent
|
|
|
|
* would have been the one that performed the allocation.
|
|
|
|
*/
|
|
|
|
if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
|
|
|
|
pio = zio_unique_parent(pio);
|
|
|
|
flags |= METASLAB_GANG_CHILD;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(IO_IS_ALLOCATING(pio));
|
|
|
|
ASSERT3P(zio, !=, zio->io_logical);
|
|
|
|
ASSERT(zio->io_logical != NULL);
|
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
|
|
|
|
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
|
|
|
|
|
|
|
|
mutex_enter(&pio->io_lock);
|
|
|
|
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
|
|
|
|
mutex_exit(&pio->io_lock);
|
|
|
|
|
|
|
|
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
|
|
|
|
1, pio);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Call into the pipeline to see if there is more work that
|
|
|
|
* needs to be done. If there is work to be done it will be
|
|
|
|
* dispatched to another taskq thread.
|
|
|
|
*/
|
|
|
|
zio_allocate_dispatch(zio->io_spa);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static int
|
|
|
|
zio_done(zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* Always attempt to keep stack usage minimal here since
|
|
|
|
* we can be called recurisvely up to 19 levels deep.
|
|
|
|
*/
|
2017-01-05 22:10:07 +03:00
|
|
|
const uint64_t psize = zio->io_size;
|
2009-02-18 23:51:31 +03:00
|
|
|
zio_t *pio, *pio_next;
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_link_t *zl = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
2009-07-03 02:44:48 +04:00
|
|
|
* If our children haven't all completed,
|
2008-12-03 23:09:06 +03:00
|
|
|
* wait for them and then repeat this pipeline stage.
|
|
|
|
*/
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
OpenZFS 8857 - zio_remove_child() panic due to already destroyed parent zio
PROBLEM
=======
It's possible for a parent zio to complete even though it has children
which have not completed. This can result in the following panic:
> $C
ffffff01809128c0 vpanic()
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
ffffff3373370908)
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
ffffff0180912b40 thread_start+8()
> ::status
debugging crash dump vmcore.2 (64-bit) from batfs0390
operating system: 5.11 joyent_20170911T171900Z (i86pc)
image uuid: (not set)
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
owner=ffffff3c59b39480 thread=ffffff0180912c40
dump content: kernel pages only
The problem is that dbuf_prefetch along with l2arc can create a zio tree
which confuses the parent zio and allows it to complete with while children
still exist. Here's the scenario:
zio tree:
pio
|--- lio
The parent zio, pio, has entered the zio_done stage and begins to check its
children to see there are still some that have not completed. In zio_done(),
the children are checked in the following order:
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
If pio, finds any child which has not completed then it stops executing and
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
while checking the particular child.
In this scenario, the pio has completed the first call to
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
the only zio in the zio tree right now is the logical zio, lio, then it
completes that call and prepares to check the next child type.
In the meantime, the lio completes and in its callback creates a child vdev
zio, cio. The zio tree looks like this:
zio tree:
pio
|--- lio
|--- cio
The lio then grabs the parent's io_lock and removes itself.
zio tree:
pio
|--- cio
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
and will erroneously complete. When the child zio, cio, completes it will panic
the system trying to reference the parent zio which has been destroyed.
SOLUTION
========
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
for all the children types that it's interested in checking. The
io_lock will is held the entire time we check all the children types. Since
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
the zio_wiat_for_children logic.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/8857
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
Issue #5918
Closes #7168
2018-02-08 23:04:14 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
/*
|
|
|
|
* If the allocation throttle is enabled, then update the accounting.
|
|
|
|
* We only track child I/Os that are part of an allocating async
|
|
|
|
* write. We must do this since the allocation is performed
|
|
|
|
* by the logical I/O but the actual write is done by child I/Os.
|
|
|
|
*/
|
|
|
|
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
|
|
|
|
zio->io_child_type == ZIO_CHILD_VDEV) {
|
|
|
|
ASSERT(spa_normal_class(
|
|
|
|
zio->io_spa)->mc_alloc_throttle_enabled);
|
|
|
|
zio_dva_throttle_done(zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the allocation throttle is enabled, verify that
|
|
|
|
* we have decremented the refcounts for every I/O that was throttled.
|
|
|
|
*/
|
|
|
|
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
|
|
|
|
ASSERT(zio->io_bp != NULL);
|
|
|
|
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio);
|
|
|
|
VERIFY(refcount_not_held(
|
|
|
|
&(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio));
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-11-04 23:25:13 +03:00
|
|
|
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
|
|
|
|
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_children[c][w] == 0);
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(zio->io_bp->blk_pad[0] == 0);
|
|
|
|
ASSERT(zio->io_bp->blk_pad[1] == 0);
|
2013-11-01 23:26:11 +04:00
|
|
|
ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
|
|
|
|
sizeof (blkptr_t)) == 0 ||
|
2010-08-26 22:04:17 +04:00
|
|
|
(zio->io_bp == zio_unique_parent(zio)->io_bp));
|
|
|
|
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
|
2010-05-29 00:45:14 +04:00
|
|
|
zio->io_bp_override == NULL &&
|
2008-12-03 23:09:06 +03:00
|
|
|
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
|
2013-11-01 23:26:11 +04:00
|
|
|
ASSERT3U(zio->io_prop.zp_copies, <=,
|
|
|
|
BP_GET_NDVAS(zio->io_bp));
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
|
2013-11-01 23:26:11 +04:00
|
|
|
(BP_COUNT_GANG(zio->io_bp) ==
|
|
|
|
BP_GET_NDVAS(zio->io_bp)));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2013-05-10 23:47:54 +04:00
|
|
|
if (zio->io_flags & ZIO_FLAG_NOPWRITE)
|
|
|
|
VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* If there were child vdev/gang/ddt errors, they apply to us now.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
|
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the I/O on the transformed data was successful, generate any
|
|
|
|
* checksum reports now while we still have the transformed data.
|
|
|
|
*/
|
|
|
|
if (zio->io_error == 0) {
|
|
|
|
while (zio->io_cksum_report != NULL) {
|
|
|
|
zio_cksum_report_t *zcr = zio->io_cksum_report;
|
|
|
|
uint64_t align = zcr->zcr_align;
|
2016-07-22 18:52:49 +03:00
|
|
|
uint64_t asize = P2ROUNDUP(psize, align);
|
|
|
|
abd_t *adata = zio->io_abd;
|
|
|
|
|
|
|
|
if (asize != psize) {
|
2017-01-05 22:10:07 +03:00
|
|
|
adata = abd_alloc(asize, B_TRUE);
|
2016-07-22 18:52:49 +03:00
|
|
|
abd_copy(adata, zio->io_abd, psize);
|
|
|
|
abd_zero_off(adata, psize, asize - psize);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
zio->io_cksum_report = zcr->zcr_next;
|
|
|
|
zcr->zcr_next = NULL;
|
2017-01-05 22:10:07 +03:00
|
|
|
zcr->zcr_finish(zcr, adata);
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_ereport_free_checksum(zcr);
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
if (asize != psize)
|
|
|
|
abd_free(adata);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
zio_pop_transforms(zio); /* note: may set zio->io_error */
|
|
|
|
|
2016-07-22 18:52:49 +03:00
|
|
|
vdev_stat_update(zio, psize);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-10-02 03:54:52 +04:00
|
|
|
/*
|
2013-04-30 02:49:23 +04:00
|
|
|
* If this I/O is attached to a particular vdev is slow, exceeding
|
2012-12-21 06:15:34 +04:00
|
|
|
* 30 seconds to complete, post an error described the I/O delay.
|
|
|
|
* We ignore these errors if the device is currently unavailable.
|
2010-10-02 03:54:52 +04:00
|
|
|
*/
|
2016-02-29 21:05:23 +03:00
|
|
|
if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
|
2012-12-21 06:15:34 +04:00
|
|
|
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
|
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio->io_vd, &zio->io_bookmark, zio, 0, 0);
|
2012-12-21 06:15:34 +04:00
|
|
|
}
|
2010-10-02 03:54:52 +04:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error) {
|
|
|
|
/*
|
|
|
|
* If this I/O is attached to a particular vdev,
|
|
|
|
* generate an error message describing the I/O failure
|
|
|
|
* at the block level. We ignore these errors if the
|
|
|
|
* device is currently unavailable.
|
|
|
|
*/
|
2010-08-26 22:04:17 +04:00
|
|
|
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
|
2016-12-12 21:46:26 +03:00
|
|
|
!vdev_is_dead(zio->io_vd))
|
2010-08-26 22:04:17 +04:00
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio->io_vd, &zio->io_bookmark, zio, 0, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zio->io_error == EIO || !(zio->io_flags &
|
|
|
|
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
|
2010-08-26 22:04:17 +04:00
|
|
|
zio == zio->io_logical) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* For logical I/O requests, tell the SPA to log the
|
|
|
|
* error and generate a logical data ereport.
|
|
|
|
*/
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
spa_log_error(zio->io_spa, &zio->io_bookmark);
|
2013-11-01 23:26:11 +04:00
|
|
|
zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
NULL, &zio->io_bookmark, zio, 0, 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:04:17 +04:00
|
|
|
if (zio->io_error && zio == zio->io_logical) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Determine whether zio should be reexecuted. This will
|
|
|
|
* propagate all the way to the root via zio_notify_parent().
|
|
|
|
*/
|
2010-08-26 22:04:17 +04:00
|
|
|
ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (IO_IS_ALLOCATING(zio) &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_error != ENOSPC)
|
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
|
|
|
|
else
|
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if ((zio->io_type == ZIO_TYPE_READ ||
|
|
|
|
zio->io_type == ZIO_TYPE_FREE) &&
|
2010-08-27 01:24:34 +04:00
|
|
|
!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_error == ENXIO &&
|
2010-08-26 22:04:17 +04:00
|
|
|
spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
|
|
|
|
spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
|
|
|
|
|
|
|
if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
|
|
|
|
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Here is a possibly good place to attempt to do
|
|
|
|
* either combinatorial reconstruction or error correction
|
|
|
|
* based on checksums. It also might be a good place
|
|
|
|
* to send out preliminary ereports before we suspend
|
|
|
|
* processing.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* If there were logical child errors, they apply to us now.
|
|
|
|
* We defer this until now to avoid conflating logical child
|
|
|
|
* errors with errors that happened to the zio itself when
|
|
|
|
* updating vdev stats and reporting FMA events above.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zio->io_error || zio->io_reexecute) &&
|
|
|
|
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
|
2013-05-10 23:47:54 +04:00
|
|
|
!(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
|
2010-08-26 22:04:17 +04:00
|
|
|
zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
zio_gang_tree_free(&zio->io_gang_tree);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Godfather I/Os should never suspend.
|
|
|
|
*/
|
|
|
|
if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
|
|
|
|
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
|
2017-01-28 23:13:34 +03:00
|
|
|
zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_reexecute) {
|
|
|
|
/*
|
|
|
|
* This is a logical I/O that wants to reexecute.
|
|
|
|
*
|
|
|
|
* Reexecute is top-down. When an i/o fails, if it's not
|
|
|
|
* the root, it simply notifies its parent and sticks around.
|
|
|
|
* The parent, seeing that it still has children in zio_done(),
|
|
|
|
* does the same. This percolates all the way up to the root.
|
|
|
|
* The root i/o will reexecute or suspend the entire tree.
|
|
|
|
*
|
|
|
|
* This approach ensures that zio_reexecute() honors
|
|
|
|
* all the original i/o dependency relationships, e.g.
|
|
|
|
* parents not executing until children are ready.
|
|
|
|
*/
|
|
|
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zio->io_gang_leader = NULL;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_state[ZIO_WAIT_DONE] = 1;
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
|
|
|
* "The Godfather" I/O monitors its children but is
|
|
|
|
* not a true parent to them. It will track them through
|
|
|
|
* the pipeline but severs its ties whenever they get into
|
|
|
|
* trouble (e.g. suspended). This allows "The Godfather"
|
|
|
|
* I/O to return status without blocking.
|
|
|
|
*/
|
2016-10-14 03:59:18 +03:00
|
|
|
zl = NULL;
|
|
|
|
for (pio = zio_walk_parents(zio, &zl); pio != NULL;
|
|
|
|
pio = pio_next) {
|
|
|
|
zio_link_t *remove_zl = zl;
|
|
|
|
pio_next = zio_walk_parents(zio, &zl);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
|
|
|
|
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_remove_child(pio, zio, remove_zl);
|
2009-07-03 02:44:48 +04:00
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
if ((pio = zio_unique_parent(zio)) != NULL) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* We're not a root i/o, so there's nothing to do
|
|
|
|
* but notify our parent. Don't propagate errors
|
|
|
|
* upward since we haven't permanently failed yet.
|
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
|
2008-12-03 23:09:06 +03:00
|
|
|
zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
|
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
|
|
|
|
} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
|
|
|
|
/*
|
|
|
|
* We'd fail again if we reexecuted now, so suspend
|
|
|
|
* until conditions improve (e.g. device comes online).
|
|
|
|
*/
|
2018-03-15 20:56:55 +03:00
|
|
|
zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Reexecution is potentially a huge amount of work.
|
|
|
|
* Hand it off to the otherwise-unused claim taskq.
|
|
|
|
*/
|
2011-11-08 04:26:52 +04:00
|
|
|
ASSERT(taskq_empty_ent(&zio->io_tqent));
|
2013-05-06 23:24:30 +04:00
|
|
|
spa_taskq_dispatch_ent(zio->io_spa,
|
|
|
|
ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
|
2011-11-08 04:26:52 +04:00
|
|
|
(task_func_t *)zio_reexecute, zio, 0,
|
|
|
|
&zio->io_tqent);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(zio->io_child_count == 0);
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(zio->io_reexecute == 0);
|
|
|
|
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Report any checksum errors, since the I/O is complete.
|
|
|
|
*/
|
|
|
|
while (zio->io_cksum_report != NULL) {
|
|
|
|
zio_cksum_report_t *zcr = zio->io_cksum_report;
|
|
|
|
zio->io_cksum_report = zcr->zcr_next;
|
|
|
|
zcr->zcr_next = NULL;
|
|
|
|
zcr->zcr_finish(zcr, NULL);
|
|
|
|
zfs_ereport_free_checksum(zcr);
|
|
|
|
}
|
|
|
|
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
|
2014-06-06 01:19:08 +04:00
|
|
|
!BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
|
|
|
|
!(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
|
Add FASTWRITE algorithm for synchronous writes.
Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks accross vdevs is important for performance: indeed, using
mutliple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:
1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;
2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;
3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.
The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.
This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.
The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.
metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implictly by
metaslab_alloc().
ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.
A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
linguering fastwrites will get pruned at each sync event.
The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
2012-06-27 17:20:20 +04:00
|
|
|
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
|
|
|
|
}
|
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
/*
|
|
|
|
* It is the responsibility of the done callback to ensure that this
|
|
|
|
* particular zio is no longer discoverable for adoption, and as
|
|
|
|
* such, cannot acquire any new parents.
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_done)
|
|
|
|
zio->io_done(zio);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_state[ZIO_WAIT_DONE] = 1;
|
|
|
|
mutex_exit(&zio->io_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-14 03:59:18 +03:00
|
|
|
zl = NULL;
|
|
|
|
for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
|
|
|
|
zio_link_t *remove_zl = zl;
|
|
|
|
pio_next = zio_walk_parents(zio, &zl);
|
|
|
|
zio_remove_child(pio, zio, remove_zl);
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zio->io_waiter != NULL) {
|
|
|
|
mutex_enter(&zio->io_lock);
|
|
|
|
zio->io_executor = NULL;
|
|
|
|
cv_broadcast(&zio->io_cv);
|
|
|
|
mutex_exit(&zio->io_lock);
|
|
|
|
} else {
|
|
|
|
zio_destroy(zio);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ZIO_PIPELINE_STOP);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* ==========================================================================
|
|
|
|
* I/O pipeline definition
|
|
|
|
* ==========================================================================
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static zio_pipe_stage_t *zio_pipeline[] = {
|
2008-12-03 23:09:06 +03:00
|
|
|
NULL,
|
|
|
|
zio_read_bp_init,
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_write_bp_init,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_free_bp_init,
|
|
|
|
zio_issue_async,
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_write_compress,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
zio_encrypt,
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_checksum_generate,
|
2013-05-10 23:47:54 +04:00
|
|
|
zio_nop_write,
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_ddt_read_start,
|
|
|
|
zio_ddt_read_done,
|
|
|
|
zio_ddt_write,
|
|
|
|
zio_ddt_free,
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_gang_assemble,
|
|
|
|
zio_gang_issue,
|
2016-10-14 03:59:18 +03:00
|
|
|
zio_dva_throttle,
|
2008-12-03 23:09:06 +03:00
|
|
|
zio_dva_allocate,
|
|
|
|
zio_dva_free,
|
|
|
|
zio_dva_claim,
|
|
|
|
zio_ready,
|
|
|
|
zio_vdev_io_start,
|
|
|
|
zio_vdev_io_done,
|
|
|
|
zio_vdev_io_assess,
|
|
|
|
zio_checksum_verify,
|
|
|
|
zio_done
|
|
|
|
};
|
2010-08-26 22:49:16 +04:00
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
/*
|
|
|
|
* Compare two zbookmark_phys_t's to see which we would reach first in a
|
|
|
|
* pre-order traversal of the object tree.
|
|
|
|
*
|
|
|
|
* This is simple in every case aside from the meta-dnode object. For all other
|
|
|
|
* objects, we traverse them in order (object 1 before object 2, and so on).
|
|
|
|
* However, all of these objects are traversed while traversing object 0, since
|
|
|
|
* the data it points to is the list of objects. Thus, we need to convert to a
|
|
|
|
* canonical representation so we can compare meta-dnode bookmarks to
|
|
|
|
* non-meta-dnode bookmarks.
|
|
|
|
*
|
|
|
|
* We do this by calculating "equivalents" for each field of the zbookmark.
|
|
|
|
* zbookmarks outside of the meta-dnode use their own object and level, and
|
|
|
|
* calculate the level 0 equivalent (the first L0 blkid that is contained in the
|
|
|
|
* blocks this bookmark refers to) by multiplying their blkid by their span
|
|
|
|
* (the number of L0 blocks contained within one block at their level).
|
|
|
|
* zbookmarks inside the meta-dnode calculate their object equivalent
|
|
|
|
* (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
|
|
|
|
* level + 1<<31 (any value larger than a level could ever be) for their level.
|
|
|
|
* This causes them to always compare before a bookmark in their object
|
|
|
|
* equivalent, compare appropriately to bookmarks in other objects, and to
|
|
|
|
* compare appropriately to other bookmarks in the meta-dnode.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
|
|
|
|
const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* These variables represent the "equivalent" values for the zbookmark,
|
|
|
|
* after converting zbookmarks inside the meta dnode to their
|
|
|
|
* normal-object equivalents.
|
|
|
|
*/
|
|
|
|
uint64_t zb1obj, zb2obj;
|
|
|
|
uint64_t zb1L0, zb2L0;
|
|
|
|
uint64_t zb1level, zb2level;
|
|
|
|
|
|
|
|
if (zb1->zb_object == zb2->zb_object &&
|
|
|
|
zb1->zb_level == zb2->zb_level &&
|
|
|
|
zb1->zb_blkid == zb2->zb_blkid)
|
|
|
|
return (0);
|
2012-12-14 03:24:15 +04:00
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
/*
|
|
|
|
* BP_SPANB calculates the span in blocks.
|
|
|
|
*/
|
|
|
|
zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
|
|
|
|
zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
|
2012-12-14 03:24:15 +04:00
|
|
|
|
|
|
|
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
|
2015-12-22 04:31:57 +03:00
|
|
|
zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
|
|
|
|
zb1L0 = 0;
|
|
|
|
zb1level = zb1->zb_level + COMPARE_META_LEVEL;
|
|
|
|
} else {
|
|
|
|
zb1obj = zb1->zb_object;
|
|
|
|
zb1level = zb1->zb_level;
|
2012-12-14 03:24:15 +04:00
|
|
|
}
|
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
|
|
|
|
zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
|
|
|
|
zb2L0 = 0;
|
|
|
|
zb2level = zb2->zb_level + COMPARE_META_LEVEL;
|
|
|
|
} else {
|
|
|
|
zb2obj = zb2->zb_object;
|
|
|
|
zb2level = zb2->zb_level;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Now that we have a canonical representation, do the comparison. */
|
|
|
|
if (zb1obj != zb2obj)
|
|
|
|
return (zb1obj < zb2obj ? -1 : 1);
|
|
|
|
else if (zb1L0 != zb2L0)
|
|
|
|
return (zb1L0 < zb2L0 ? -1 : 1);
|
|
|
|
else if (zb1level != zb2level)
|
|
|
|
return (zb1level > zb2level ? -1 : 1);
|
|
|
|
/*
|
|
|
|
* This can (theoretically) happen if the bookmarks have the same object
|
|
|
|
* and level, but different blkids, if the block sizes are not the same.
|
|
|
|
* There is presently no way to change the indirect block sizes
|
|
|
|
*/
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function checks the following: given that last_block is the place that
|
|
|
|
* our traversal stopped last time, does that guarantee that we've visited
|
|
|
|
* every node under subtree_root? Therefore, we can't just use the raw output
|
|
|
|
* of zbookmark_compare. We have to pass in a modified version of
|
|
|
|
* subtree_root; by incrementing the block id, and then checking whether
|
|
|
|
* last_block is before or equal to that, we can tell whether or not having
|
|
|
|
* visited last_block implies that all of subtree_root's children have been
|
|
|
|
* visited.
|
|
|
|
*/
|
|
|
|
boolean_t
|
|
|
|
zbookmark_subtree_completed(const dnode_phys_t *dnp,
|
|
|
|
const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
|
|
|
|
{
|
|
|
|
zbookmark_phys_t mod_zb = *subtree_root;
|
|
|
|
mod_zb.zb_blkid++;
|
|
|
|
ASSERT(last_block->zb_level == 0);
|
|
|
|
|
|
|
|
/* The objset_phys_t isn't before anything. */
|
|
|
|
if (dnp == NULL)
|
2012-12-14 03:24:15 +04:00
|
|
|
return (B_FALSE);
|
2015-12-22 04:31:57 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
|
|
|
|
* data block size in sectors, because that variable is only used if
|
|
|
|
* the bookmark refers to a block in the meta-dnode. Since we don't
|
|
|
|
* know without examining it what object it refers to, and there's no
|
|
|
|
* harm in passing in this value in other cases, we always pass it in.
|
|
|
|
*
|
|
|
|
* We pass in 0 for the indirect block size shift because zb2 must be
|
|
|
|
* level 0. The indirect block size is only used to calculate the span
|
|
|
|
* of the bookmark, but since the bookmark must be level 0, the span is
|
|
|
|
* always 1, so the math works out.
|
|
|
|
*
|
|
|
|
* If you make changes to how the zbookmark_compare code works, be sure
|
|
|
|
* to make sure that this code still works afterwards.
|
|
|
|
*/
|
|
|
|
return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
|
|
|
|
1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
|
|
|
|
last_block) <= 0);
|
2012-12-14 03:24:15 +04:00
|
|
|
}
|
|
|
|
|
2010-08-26 22:49:16 +04:00
|
|
|
#if defined(_KERNEL) && defined(HAVE_SPL)
|
|
|
|
EXPORT_SYMBOL(zio_type_name);
|
2014-12-16 22:44:24 +03:00
|
|
|
EXPORT_SYMBOL(zio_buf_alloc);
|
|
|
|
EXPORT_SYMBOL(zio_data_buf_alloc);
|
|
|
|
EXPORT_SYMBOL(zio_buf_free);
|
|
|
|
EXPORT_SYMBOL(zio_data_buf_free);
|
2010-08-26 22:49:16 +04:00
|
|
|
|
2010-10-02 03:54:52 +04:00
|
|
|
module_param(zio_delay_max, int, 0644);
|
2011-05-04 02:09:28 +04:00
|
|
|
MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
|
|
|
|
|
|
|
|
module_param(zio_requeue_io_start_cut_in_line, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
|
2013-06-04 13:25:22 +04:00
|
|
|
|
|
|
|
module_param(zfs_sync_pass_deferred_free, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
|
2013-11-01 23:26:11 +04:00
|
|
|
"Defer frees starting in this pass");
|
2013-06-04 13:25:22 +04:00
|
|
|
|
|
|
|
module_param(zfs_sync_pass_dont_compress, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
|
2013-11-01 23:26:11 +04:00
|
|
|
"Don't compress starting in this pass");
|
2013-06-04 13:25:22 +04:00
|
|
|
|
|
|
|
module_param(zfs_sync_pass_rewrite, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zfs_sync_pass_rewrite,
|
2013-11-01 23:26:11 +04:00
|
|
|
"Rewrite new bps starting in this pass");
|
2016-10-14 03:59:18 +03:00
|
|
|
|
|
|
|
module_param(zio_dva_throttle_enabled, int, 0644);
|
|
|
|
MODULE_PARM_DESC(zio_dva_throttle_enabled,
|
|
|
|
"Throttle block allocations in the ZIO pipeline");
|
2010-08-26 22:49:16 +04:00
|
|
|
#endif
|