2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
2022-07-12 00:16:13 +03:00
|
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
2008-11-20 23:01:55 +03:00
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
2009-08-18 22:43:27 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2024-03-29 22:15:56 +03:00
|
|
|
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
|
2011-11-12 02:07:54 +04:00
|
|
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
2017-05-30 21:39:17 +03:00
|
|
|
* Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved.
|
|
|
|
* Copyright (c) 2014 Integros [integros.com]
|
2018-09-06 04:33:36 +03:00
|
|
|
* Copyright (c) 2017, Intel Corporation.
|
2019-03-12 23:13:22 +03:00
|
|
|
* Copyright (c) 2019 Datto Inc.
|
2021-02-18 08:30:45 +03:00
|
|
|
* Portions Copyright 2010 Robert Milkowski
|
|
|
|
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
|
2022-10-20 03:07:51 +03:00
|
|
|
* Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
|
feature: large_microzap
In a4b21eadec we added the zap_micro_max_size tuneable to raise the size
at which "micro" (single-block) ZAPs are upgraded to "fat" (multi-block)
ZAPs. Before this, a microZAP was limited to 128KiB, which was the old
largest block size. The side effect of raising the max size past 128KiB
is that the microZAP must be stored in a large block, requiring the
large_blocks feature.
Unfortunately, this means that a backup stream created without the
--large-block (-L) flag to zfs send would split the microZAP block into
smaller blocks and send those, as is normal behaviour for large blocks.
This would be received correctly, but since microZAPs are limited to the
first block in the object by definition, the entries in the later blocks
would be inaccessible. For directory ZAPs, this gives the appearance of
files being lost.
This commit adds a feature flag, large_microzap, that must be enabled
for microZAPs to grow beyond 128KiB, and which will be activated the
first time that occurs. This feature is later checked when generating
the stream and if active, the send operation will abort unless
--large-block has also been requested.
Changing the limit still requires zap_micro_max_size to be changed. The
state of this flag effectively sets the upper value for this tuneable,
that is, if the feature is disabled, the tuneable will be clamped to
128KiB.
A stream flag is also added to ensure that the receiver also activates
its own feature flag upon receiving the stream. This is not strictly
necessary to _use_ the received microZAP, since it doesn't care how
large its block is, but it is required to send the microZAP object on,
otherwise the original problem occurs again.
Because it's difficult to reliably distinguish a microZAP from a fatZAP
from outside the ZAP code, and because it seems unlikely that most
users are affected (a fairly niche tuneable combined with what should be
an uncommon use of send), and for the sake of expediency, this change
activates the feature the first time a microZAP grows to use a large
block, and is never deactivated after that. This can be improved in the
future.
This commit changes nothing for existing pools that already have large
microZAPs. The feature will not be retroactively applied, but will be
activated the next time a microZAP grows past the limit.
Don't use large_blocks feature for enable/disable tests. The
large_microzap depends on large_blocks, so it gets enabled as a
dependency, breaking the test. Instead use feature "longname", which has
the exact same feature characteristics.
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16593
2024-10-03 06:47:11 +03:00
|
|
|
* Copyright (c) 2024, Klara, Inc.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _SYS_FS_ZFS_H
|
2021-05-15 12:53:14 +03:00
|
|
|
#define _SYS_FS_ZFS_H extern __attribute__((visibility("default")))
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-08-18 22:43:27 +04:00
|
|
|
#include <sys/time.h>
|
2016-02-29 21:05:23 +03:00
|
|
|
#include <sys/zio_priority.h>
|
2009-08-18 22:43:27 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Types and constants shared between userland and the kernel.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * Each dataset can be one of the following types.  These constants can be
 * combined into masks that can be passed to various functions.
 *
 * ZFS_TYPE_INVALID is zero (no bits set).  Every other constant occupies
 * its own bit, so any combination of them can be OR-ed into a single mask
 * without colliding.
 */
typedef enum {
	ZFS_TYPE_INVALID	= 0,
	ZFS_TYPE_FILESYSTEM	= (1 << 0),
	ZFS_TYPE_SNAPSHOT	= (1 << 1),
	ZFS_TYPE_VOLUME		= (1 << 2),
	ZFS_TYPE_POOL		= (1 << 3),
	ZFS_TYPE_BOOKMARK	= (1 << 4),
	ZFS_TYPE_VDEV		= (1 << 5),
} zfs_type_t;
|
|
|
|
|
2017-01-23 20:49:57 +03:00
|
|
|
/*
 * NB: lzc_dataset_type should be updated whenever a new objset type is added,
 * if it represents a real type of a dataset that can be created from userland.
 *
 * Values are assigned implicitly, starting at 0 for DMU_OST_NONE.
 * DMU_OST_NUMTYPES is the final enumerator, so its value equals the number
 * of types listed before it.
 */
typedef enum dmu_objset_type {
	DMU_OST_NONE,
	DMU_OST_META,
	DMU_OST_ZFS,
	DMU_OST_ZVOL,
	DMU_OST_OTHER,			/* For testing only! */
	DMU_OST_ANY,			/* Be careful! */
	DMU_OST_NUMTYPES
} dmu_objset_type_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * Mask matching the three zfs_type_t values OR-ed together here:
 * filesystems, volumes and snapshots.
 */
#define	ZFS_TYPE_DATASET	\
	(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
|
|
|
|
|
2016-06-16 00:28:36 +03:00
|
|
|
/*
 * All of these include the terminating NUL byte.
 *
 * NOTE(review): ZAP_MAXNAMELEN_NEW and ZAP_OLDMAXVALUELEN presumably are the
 * newer / older counterparts of the limits next to them -- confirm against
 * the ZAP implementation before relying on that reading.
 */
#define	ZAP_MAXNAMELEN			256
#define	ZAP_MAXNAMELEN_NEW		1024
#define	ZAP_MAXVALUELEN			(1024 * 8)	/* 8192 */
#define	ZAP_OLDMAXVALUELEN		1024
#define	ZFS_MAX_DATASET_NAME_LEN	256
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * Dataset properties are identified by these constants and must be added to
 * the end of this list to ensure that external consumers are not affected
 * by the change. If you make any changes to this list, be sure to update
 * the property table in module/zcommon/zfs_prop.c.
 *
 * ZPROP_CONT and ZPROP_INVAL are negative values that precede the real
 * properties; ZPROP_USERPROP is an alias for ZPROP_INVAL.  Real properties
 * are numbered consecutively from ZFS_PROP_TYPE (0).  ZFS_NUM_PROPS is the
 * final enumerator, so its value equals the number of real properties.
 */
typedef enum {
	ZPROP_CONT = -2,
	ZPROP_INVAL = -1,
	ZPROP_USERPROP = ZPROP_INVAL,
	ZFS_PROP_TYPE = 0,
	ZFS_PROP_CREATION,
	ZFS_PROP_USED,
	ZFS_PROP_AVAILABLE,
	ZFS_PROP_REFERENCED,
	ZFS_PROP_COMPRESSRATIO,
	ZFS_PROP_MOUNTED,
	ZFS_PROP_ORIGIN,
	ZFS_PROP_QUOTA,
	ZFS_PROP_RESERVATION,
	ZFS_PROP_VOLSIZE,
	ZFS_PROP_VOLBLOCKSIZE,
	ZFS_PROP_RECORDSIZE,
	ZFS_PROP_MOUNTPOINT,
	ZFS_PROP_SHARENFS,
	ZFS_PROP_CHECKSUM,
	ZFS_PROP_COMPRESSION,
	ZFS_PROP_ATIME,
	ZFS_PROP_DEVICES,
	ZFS_PROP_EXEC,
	ZFS_PROP_SETUID,
	ZFS_PROP_READONLY,
	ZFS_PROP_ZONED,
	ZFS_PROP_SNAPDIR,
	ZFS_PROP_ACLMODE,
	ZFS_PROP_ACLINHERIT,
	ZFS_PROP_CREATETXG,
	ZFS_PROP_NAME,			/* not exposed to the user */
	ZFS_PROP_CANMOUNT,
	ZFS_PROP_ISCSIOPTIONS,		/* not exposed to the user */
	ZFS_PROP_XATTR,
	ZFS_PROP_NUMCLONES,		/* not exposed to the user */
	ZFS_PROP_COPIES,
	ZFS_PROP_VERSION,
	ZFS_PROP_UTF8ONLY,
	ZFS_PROP_NORMALIZE,
	ZFS_PROP_CASE,
	ZFS_PROP_VSCAN,
	ZFS_PROP_NBMAND,
	ZFS_PROP_SHARESMB,
	ZFS_PROP_REFQUOTA,
	ZFS_PROP_REFRESERVATION,
	ZFS_PROP_GUID,
	ZFS_PROP_PRIMARYCACHE,
	ZFS_PROP_SECONDARYCACHE,
	ZFS_PROP_USEDSNAP,
	ZFS_PROP_USEDDS,
	ZFS_PROP_USEDCHILD,
	ZFS_PROP_USEDREFRESERV,
	ZFS_PROP_USERACCOUNTING,	/* not exposed to the user */
	ZFS_PROP_STMF_SHAREINFO,	/* not exposed to the user */
	ZFS_PROP_DEFER_DESTROY,
	ZFS_PROP_USERREFS,
	ZFS_PROP_LOGBIAS,
	ZFS_PROP_UNIQUE,		/* not exposed to the user */
	ZFS_PROP_OBJSETID,
	ZFS_PROP_DEDUP,
	ZFS_PROP_MLSLABEL,
	ZFS_PROP_SYNC,
	ZFS_PROP_DNODESIZE,
	ZFS_PROP_REFRATIO,
	ZFS_PROP_WRITTEN,
	ZFS_PROP_CLONES,
	ZFS_PROP_LOGICALUSED,
	ZFS_PROP_LOGICALREFERENCED,
	ZFS_PROP_INCONSISTENT,		/* not exposed to the user */
	ZFS_PROP_VOLMODE,
	ZFS_PROP_FILESYSTEM_LIMIT,
	ZFS_PROP_SNAPSHOT_LIMIT,
	ZFS_PROP_FILESYSTEM_COUNT,
	ZFS_PROP_SNAPSHOT_COUNT,
	ZFS_PROP_SNAPDEV,
	ZFS_PROP_ACLTYPE,
	ZFS_PROP_SELINUX_CONTEXT,
	ZFS_PROP_SELINUX_FSCONTEXT,
	ZFS_PROP_SELINUX_DEFCONTEXT,
	ZFS_PROP_SELINUX_ROOTCONTEXT,
	ZFS_PROP_RELATIME,
	ZFS_PROP_REDUNDANT_METADATA,
	ZFS_PROP_OVERLAY,
	ZFS_PROP_PREV_SNAP,
	ZFS_PROP_RECEIVE_RESUME_TOKEN,
	ZFS_PROP_ENCRYPTION,
	ZFS_PROP_KEYLOCATION,
	ZFS_PROP_KEYFORMAT,
	ZFS_PROP_PBKDF2_SALT,
	ZFS_PROP_PBKDF2_ITERS,
	ZFS_PROP_ENCRYPTION_ROOT,
	ZFS_PROP_KEY_GUID,
	ZFS_PROP_KEYSTATUS,
	ZFS_PROP_REMAPTXG,		/* obsolete - no longer used */
	ZFS_PROP_SPECIAL_SMALL_BLOCKS,
	ZFS_PROP_IVSET_GUID,		/* not exposed to the user */
	ZFS_PROP_REDACTED,
	ZFS_PROP_REDACT_SNAPS,
	ZFS_PROP_SNAPSHOTS_CHANGED,
	ZFS_PROP_PREFETCH,
	ZFS_PROP_VOLTHREADING,
	ZFS_PROP_DIRECT,
	ZFS_PROP_LONGNAME,
	ZFS_NUM_PROPS
} zfs_prop_t;
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
 * Identifiers for the per-user, per-group and per-project "used" and "quota"
 * properties, each in a plain and an "OBJ" variant.  Values are assigned
 * implicitly from 0.  ZFS_NUM_USERQUOTA_PROPS is the final enumerator, so
 * its value equals the number of identifiers before it.
 */
typedef enum {
	ZFS_PROP_USERUSED,
	ZFS_PROP_USERQUOTA,
	ZFS_PROP_GROUPUSED,
	ZFS_PROP_GROUPQUOTA,
	ZFS_PROP_USEROBJUSED,
	ZFS_PROP_USEROBJQUOTA,
	ZFS_PROP_GROUPOBJUSED,
	ZFS_PROP_GROUPOBJQUOTA,
	ZFS_PROP_PROJECTUSED,
	ZFS_PROP_PROJECTQUOTA,
	ZFS_PROP_PROJECTOBJUSED,
	ZFS_PROP_PROJECTOBJQUOTA,
	ZFS_NUM_USERQUOTA_PROPS
} zfs_userquota_prop_t;
|
|
|
|
|
2022-01-15 02:37:55 +03:00
|
|
|
_SYS_FS_ZFS_H const char *const zfs_userquota_prop_prefixes[
|
|
|
|
ZFS_NUM_USERQUOTA_PROPS];
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Pool properties are identified by these constants and must be added to the
|
2008-12-03 23:09:06 +03:00
|
|
|
* end of this list to ensure that external consumers are not affected
|
Implement Redacted Send/Receive
Redacted send/receive allows users to send subsets of their data to
a target system. One possible use case for this feature is to not
transmit sensitive information to a data warehousing, test/dev, or
analytics environment. Another is to save space by not replicating
unimportant data within a given dataset, for example in backup tools
like zrepl.
Redacted send/receive is a three-stage process. First, a clone (or
clones) is made of the snapshot to be sent to the target. In this
clone (or clones), all unnecessary or unwanted data is removed or
modified. This clone is then snapshotted to create the "redaction
snapshot" (or snapshots). Second, the new zfs redact command is used
to create a redaction bookmark. The redaction bookmark stores the
list of blocks in a snapshot that were modified by the redaction
snapshot(s). Finally, the redaction bookmark is passed as a parameter
to zfs send. When sending to the snapshot that was redacted, the
redaction bookmark is used to filter out blocks that contain sensitive
or unwanted information, and those blocks are not included in the send
stream. When sending from the redaction bookmark, the blocks it
contains are considered as candidate blocks in addition to those
blocks in the destination snapshot that were modified since the
creation_txg of the redaction bookmark. This step is necessary to
allow the target to rehydrate data in the case where some blocks are
accidentally or unnecessarily modified in the redaction snapshot.
The changes to bookmarks to enable fast space estimation involve
adding deadlists to bookmarks. There is also logic to manage the
life cycles of these deadlists.
The new size estimation process operates in cases where previously
an accurate estimate could not be provided. In those cases, a send
is performed where no data blocks are read, reducing the runtime
significantly and providing a byte-accurate size estimate.
Reviewed-by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Chris Williamson <chris.williamson@delphix.com>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #7958
2019-06-19 19:48:13 +03:00
|
|
|
* by the change. Properties must be registered in zfs_prop_init().
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
typedef enum {
|
2018-01-19 20:22:37 +03:00
|
|
|
ZPOOL_PROP_INVAL = -1,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZPOOL_PROP_NAME,
|
|
|
|
ZPOOL_PROP_SIZE,
|
|
|
|
ZPOOL_PROP_CAPACITY,
|
|
|
|
ZPOOL_PROP_ALTROOT,
|
|
|
|
ZPOOL_PROP_HEALTH,
|
|
|
|
ZPOOL_PROP_GUID,
|
|
|
|
ZPOOL_PROP_VERSION,
|
|
|
|
ZPOOL_PROP_BOOTFS,
|
|
|
|
ZPOOL_PROP_DELEGATION,
|
|
|
|
ZPOOL_PROP_AUTOREPLACE,
|
|
|
|
ZPOOL_PROP_CACHEFILE,
|
|
|
|
ZPOOL_PROP_FAILUREMODE,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZPOOL_PROP_LISTSNAPS,
|
2009-07-03 02:44:48 +04:00
|
|
|
ZPOOL_PROP_AUTOEXPAND,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZPOOL_PROP_DEDUPDITTO,
|
|
|
|
ZPOOL_PROP_DEDUPRATIO,
|
|
|
|
ZPOOL_PROP_FREE,
|
|
|
|
ZPOOL_PROP_ALLOCATED,
|
2010-08-27 01:24:34 +04:00
|
|
|
ZPOOL_PROP_READONLY,
|
2011-06-16 23:56:38 +04:00
|
|
|
ZPOOL_PROP_ASHIFT,
|
2011-11-15 23:01:27 +04:00
|
|
|
ZPOOL_PROP_COMMENT,
|
2012-01-24 06:43:32 +04:00
|
|
|
ZPOOL_PROP_EXPANDSZ,
|
2012-12-14 03:24:15 +04:00
|
|
|
ZPOOL_PROP_FREEING,
|
2014-07-20 00:19:24 +04:00
|
|
|
ZPOOL_PROP_FRAGMENTATION,
|
2014-06-06 01:20:08 +04:00
|
|
|
ZPOOL_PROP_LEAKED,
|
2014-11-03 23:15:08 +03:00
|
|
|
ZPOOL_PROP_MAXBLOCKSIZE,
|
2014-06-21 03:00:11 +04:00
|
|
|
ZPOOL_PROP_TNAME,
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
ZPOOL_PROP_MAXDNODESIZE,
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
ZPOOL_PROP_MULTIHOST,
|
2016-12-17 01:11:29 +03:00
|
|
|
ZPOOL_PROP_CHECKPOINT,
|
2018-08-20 19:52:37 +03:00
|
|
|
ZPOOL_PROP_LOAD_GUID,
|
2019-03-29 19:13:20 +03:00
|
|
|
ZPOOL_PROP_AUTOTRIM,
|
2021-02-18 08:30:45 +03:00
|
|
|
ZPOOL_PROP_COMPATIBILITY,
|
2023-03-10 22:59:53 +03:00
|
|
|
ZPOOL_PROP_BCLONEUSED,
|
|
|
|
ZPOOL_PROP_BCLONESAVED,
|
|
|
|
ZPOOL_PROP_BCLONERATIO,
|
ddt: dedup table quota enforcement
This adds two new pool properties:
- dedup_table_size, the total size of all DDTs on the pool; and
- dedup_table_quota, the maximum possible size of all DDTs in the pool
When set, quota will be enforced by checking when a new entry is about
to be created. If the pool is over its dedup quota, the entry won't be
created, and the corresponding write will be converted to a regular
non-dedup write. Note that existing entries can be updated (ie their
refcounts changed), as that reuses the space rather than requiring more.
dedup_table_quota can be set to 'auto', which will set it based on the
size of the devices backing the "dedup" allocation device. This makes it
possible to limit the DDTs to the size of a dedup vdev only, such that
when the device fills, no new blocks are deduplicated.
Sponsored-by: iXsystems, Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Co-authored-by: Sean Eric Fagan <sean.fagan@klarasystems.com>
Closes #15889
2024-07-25 19:47:36 +03:00
|
|
|
ZPOOL_PROP_DEDUP_TABLE_SIZE,
|
|
|
|
ZPOOL_PROP_DEDUP_TABLE_QUOTA,
|
2024-07-26 19:16:18 +03:00
|
|
|
ZPOOL_PROP_DEDUPCACHED,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZPOOL_NUM_PROPS
|
|
|
|
} zpool_prop_t;
|
|
|
|
|
2020-10-30 18:55:59 +03:00
|
|
|
/* Small enough to not hog a whole line of printout in zpool(8). */
|
2011-11-15 23:01:27 +04:00
|
|
|
#define ZPROP_MAX_COMMENT 32
|
2021-11-30 17:46:25 +03:00
|
|
|
#define ZPROP_BOOLEAN_NA 2
|
2011-11-15 23:01:27 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPROP_VALUE "value"
|
|
|
|
#define ZPROP_SOURCE "source"
|
|
|
|
|
|
|
|
/*
 * Property source flags.
 *
 * Every enumerator is a distinct single bit (0x1 .. 0x20), so values can be
 * OR'ed together into a mask; ZPROP_SRC_ALL (0x3f) is exactly the union of
 * all six.  Note that ZPROP_SRC_NONE is the bit 0x1, not the value 0.
 */
typedef enum {
|
|
|
|
ZPROP_SRC_NONE = 0x1,
|
|
|
|
ZPROP_SRC_DEFAULT = 0x2,
|
|
|
|
ZPROP_SRC_TEMPORARY = 0x4,
|
|
|
|
ZPROP_SRC_LOCAL = 0x8,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZPROP_SRC_INHERITED = 0x10,
|
|
|
|
ZPROP_SRC_RECEIVED = 0x20
|
2008-11-20 23:01:55 +03:00
|
|
|
} zprop_source_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPROP_SRC_ALL 0x3f
|
|
|
|
|
|
|
|
#define ZPROP_SOURCE_VAL_RECVD "$recvd"
|
|
|
|
#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS"
|
2016-06-10 03:04:12 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Dataset flag implemented as a special entry in the props zap object
|
|
|
|
* indicating that the dataset has received properties on or after
|
|
|
|
* SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
|
|
|
|
* just as it did in earlier versions, and thereafter, local properties are
|
|
|
|
* preserved.
|
|
|
|
*/
|
|
|
|
#define ZPROP_HAS_RECVD "$hasrecvd"
|
|
|
|
|
|
|
|
/*
 * Error flags for property clear/restore failures.  The two values are
 * distinct bits (0x1 and 0x2).
 * NOTE(review): presumably both may be reported together as a mask --
 * confirm against the code that sets them.
 */
typedef enum {
|
|
|
|
ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
|
|
|
|
ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
|
|
|
|
} zprop_errflags_t;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
 * Callback type: receives an int and an opaque pointer, returns an int.
 * NOTE(review): presumably the int is a property id and the pointer is
 * caller-supplied context -- confirm against the iterators that accept a
 * zprop_func.
 */
typedef int (*zprop_func)(int, void *);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Properties to be set on the root file system of a new pool
|
|
|
|
* are stuffed into their own nvlist, which is then included in
|
|
|
|
* the properties nvlist with the pool properties.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_ROOTFS_PROPS "root-props-nvl"
|
|
|
|
|
2018-02-08 19:16:23 +03:00
|
|
|
/*
|
|
|
|
* Length of 'written@' and 'written#'
|
|
|
|
*/
|
|
|
|
#define ZFS_WRITTEN_PROP_PREFIX_LEN 8
|
|
|
|
|
2021-11-30 17:46:25 +03:00
|
|
|
/*
|
|
|
|
* VDEV properties are identified by these constants and must be added to the
|
|
|
|
* end of this list to ensure that external consumers are not affected
|
|
|
|
* by the change. If you make any changes to this list, be sure to update
|
|
|
|
* the property table in usr/src/common/zfs/zpool_prop.c.
|
|
|
|
*/
|
|
|
|
typedef enum {
|
|
|
|
VDEV_PROP_INVAL = -1,
|
2022-06-14 21:27:53 +03:00
|
|
|
/*
 * Alias of VDEV_PROP_INVAL: both are -1.  Because this enumerator repeats
 * -1, the implicit numbering continues from it and VDEV_PROP_NAME is 0.
 */
VDEV_PROP_USERPROP = VDEV_PROP_INVAL,
|
2021-11-30 17:46:25 +03:00
|
|
|
VDEV_PROP_NAME,
|
|
|
|
VDEV_PROP_CAPACITY,
|
|
|
|
VDEV_PROP_STATE,
|
|
|
|
VDEV_PROP_GUID,
|
|
|
|
VDEV_PROP_ASIZE,
|
|
|
|
VDEV_PROP_PSIZE,
|
|
|
|
VDEV_PROP_ASHIFT,
|
|
|
|
VDEV_PROP_SIZE,
|
|
|
|
VDEV_PROP_FREE,
|
|
|
|
VDEV_PROP_ALLOCATED,
|
|
|
|
VDEV_PROP_COMMENT,
|
|
|
|
VDEV_PROP_EXPANDSZ,
|
|
|
|
VDEV_PROP_FRAGMENTATION,
|
|
|
|
VDEV_PROP_BOOTSIZE,
|
|
|
|
VDEV_PROP_PARITY,
|
|
|
|
VDEV_PROP_PATH,
|
|
|
|
VDEV_PROP_DEVID,
|
|
|
|
VDEV_PROP_PHYS_PATH,
|
|
|
|
VDEV_PROP_ENC_PATH,
|
|
|
|
VDEV_PROP_FRU,
|
|
|
|
VDEV_PROP_PARENT,
|
|
|
|
VDEV_PROP_CHILDREN,
|
|
|
|
VDEV_PROP_NUMCHILDREN,
|
|
|
|
VDEV_PROP_READ_ERRORS,
|
|
|
|
VDEV_PROP_WRITE_ERRORS,
|
|
|
|
VDEV_PROP_CHECKSUM_ERRORS,
|
|
|
|
VDEV_PROP_INITIALIZE_ERRORS,
|
|
|
|
VDEV_PROP_OPS_NULL,
|
|
|
|
VDEV_PROP_OPS_READ,
|
|
|
|
VDEV_PROP_OPS_WRITE,
|
|
|
|
VDEV_PROP_OPS_FREE,
|
|
|
|
VDEV_PROP_OPS_CLAIM,
|
|
|
|
VDEV_PROP_OPS_TRIM,
|
|
|
|
VDEV_PROP_BYTES_NULL,
|
|
|
|
VDEV_PROP_BYTES_READ,
|
|
|
|
VDEV_PROP_BYTES_WRITE,
|
|
|
|
VDEV_PROP_BYTES_FREE,
|
|
|
|
VDEV_PROP_BYTES_CLAIM,
|
|
|
|
VDEV_PROP_BYTES_TRIM,
|
|
|
|
VDEV_PROP_REMOVING,
|
|
|
|
VDEV_PROP_ALLOCATING,
|
2022-11-11 00:37:12 +03:00
|
|
|
VDEV_PROP_FAILFAST,
|
2023-01-24 00:14:25 +03:00
|
|
|
VDEV_PROP_CHECKSUM_N,
|
|
|
|
VDEV_PROP_CHECKSUM_T,
|
|
|
|
VDEV_PROP_IO_N,
|
|
|
|
VDEV_PROP_IO_T,
|
RAID-Z expansion feature
This feature allows disks to be added one at a time to a RAID-Z group,
expanding its capacity incrementally. This feature is especially useful
for small pools (typically with only one RAID-Z group), where there
isn't sufficient hardware to add capacity by adding a whole new RAID-Z
group (typically doubling the number of disks).
== Initiating expansion ==
A new device (disk) can be attached to an existing RAIDZ vdev, by
running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank
raidz2-0 sda`. The new device will become part of the RAIDZ group. A
"raidz expansion" will be initiated, and the new device will contribute
additional space to the RAIDZ group once the expansion completes.
The `feature@raidz_expansion` on-disk feature flag must be `enabled` to
initiate an expansion, and it remains `active` for the life of the pool.
In other words, pools with expanded RAIDZ vdevs can not be imported by
older releases of the ZFS software.
== During expansion ==
The expansion entails reading all allocated space from existing disks in
the RAIDZ group, and rewriting it to the new disks in the RAIDZ group
(including the newly added device).
The expansion progress can be monitored with `zpool status`.
Data redundancy is maintained during (and after) the expansion. If a
disk fails while the expansion is in progress, the expansion pauses
until the health of the RAIDZ vdev is restored (e.g. by replacing the
failed disk and waiting for reconstruction to complete).
The pool remains accessible during expansion. Following a reboot or
export/import, the expansion resumes where it left off.
== After expansion ==
When the expansion completes, the additional space is available for use,
and is reflected in the `available` zfs property (as seen in `zfs list`,
`df`, etc).
Expansion does not change the number of failures that can be tolerated
without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after
expansion).
A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old
data-to-parity ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but
distributed among the larger set of disks. New blocks will be written
with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been
expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ
vdev's "assumed parity ratio" does not change, so slightly less space
than is expected may be reported for newly-written blocks, according to
`zfs list`, `df`, `ls -s`, and similar tools.
Sponsored-by: The FreeBSD Foundation
Sponsored-by: iXsystems, Inc.
Sponsored-by: vStack
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Authored-by: Matthew Ahrens <mahrens@delphix.com>
Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com>
Contributions-by: Stuart Maybee <stuart.maybee@comcast.net>
Contributions-by: Thorsten Behrens <tbehrens@outlook.com>
Contributions-by: Fmstrat <nospam@nowsci.com>
Contributions-by: Don Brady <dev.fs.zfs@gmail.com>
Signed-off-by: Don Brady <dev.fs.zfs@gmail.com>
Closes #15022
2023-11-08 21:19:41 +03:00
|
|
|
VDEV_PROP_RAIDZ_EXPANDING,
|
2024-02-08 20:19:52 +03:00
|
|
|
VDEV_PROP_SLOW_IO_N,
|
|
|
|
VDEV_PROP_SLOW_IO_T,
|
2024-07-24 02:34:09 +03:00
|
|
|
VDEV_PROP_TRIM_SUPPORT,
|
|
|
|
VDEV_PROP_TRIM_ERRORS,
|
|
|
|
VDEV_PROP_SLOW_IOS,
|
2021-11-30 17:46:25 +03:00
|
|
|
/*
 * Sentinel: must stay last.  Since numbering restarts at 0 with
 * VDEV_PROP_NAME, this equals the number of properties from
 * VDEV_PROP_NAME through the entry directly above.
 */
VDEV_NUM_PROPS
|
|
|
|
} vdev_prop_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Dataset property functions shared between libzfs and kernel.
|
|
|
|
*/
|
2021-05-15 12:53:14 +03:00
|
|
|
_SYS_FS_ZFS_H const char *zfs_prop_default_string(zfs_prop_t);
|
|
|
|
_SYS_FS_ZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_readonly(zfs_prop_t);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_visible(zfs_prop_t prop);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_inheritable(zfs_prop_t);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_setonce(zfs_prop_t);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_encryption_key_param(zfs_prop_t);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_valid_keylocation(const char *, boolean_t);
|
|
|
|
_SYS_FS_ZFS_H const char *zfs_prop_to_name(zfs_prop_t);
|
|
|
|
_SYS_FS_ZFS_H zfs_prop_t zfs_name_to_prop(const char *);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_user(const char *);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_userquota(const char *);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_written(const char *);
|
|
|
|
_SYS_FS_ZFS_H int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
|
|
|
|
_SYS_FS_ZFS_H int zfs_prop_string_to_index(zfs_prop_t, const char *,
|
|
|
|
uint64_t *);
|
|
|
|
_SYS_FS_ZFS_H uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Pool property functions shared between libzfs and kernel.
|
|
|
|
*/
|
2021-05-15 12:53:14 +03:00
|
|
|
_SYS_FS_ZFS_H zpool_prop_t zpool_name_to_prop(const char *);
|
|
|
|
_SYS_FS_ZFS_H const char *zpool_prop_to_name(zpool_prop_t);
|
|
|
|
_SYS_FS_ZFS_H const char *zpool_prop_default_string(zpool_prop_t);
|
|
|
|
_SYS_FS_ZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zpool_prop_readonly(zpool_prop_t);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zpool_prop_setonce(zpool_prop_t);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zpool_prop_feature(const char *);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zpool_prop_unsupported(const char *);
|
|
|
|
_SYS_FS_ZFS_H int zpool_prop_index_to_string(zpool_prop_t, uint64_t,
|
|
|
|
const char **);
|
|
|
|
_SYS_FS_ZFS_H int zpool_prop_string_to_index(zpool_prop_t, const char *,
|
|
|
|
uint64_t *);
|
|
|
|
_SYS_FS_ZFS_H uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2021-11-30 17:46:25 +03:00
|
|
|
/*
|
|
|
|
* VDEV property functions shared between libzfs and kernel.
|
|
|
|
*/
|
|
|
|
_SYS_FS_ZFS_H vdev_prop_t vdev_name_to_prop(const char *);
|
|
|
|
_SYS_FS_ZFS_H boolean_t vdev_prop_user(const char *name);
|
|
|
|
_SYS_FS_ZFS_H const char *vdev_prop_to_name(vdev_prop_t);
|
|
|
|
_SYS_FS_ZFS_H const char *vdev_prop_default_string(vdev_prop_t);
|
|
|
|
_SYS_FS_ZFS_H uint64_t vdev_prop_default_numeric(vdev_prop_t);
|
|
|
|
_SYS_FS_ZFS_H boolean_t vdev_prop_readonly(vdev_prop_t prop);
|
|
|
|
_SYS_FS_ZFS_H int vdev_prop_index_to_string(vdev_prop_t, uint64_t,
|
|
|
|
const char **);
|
|
|
|
_SYS_FS_ZFS_H int vdev_prop_string_to_index(vdev_prop_t, const char *,
|
|
|
|
uint64_t *);
|
|
|
|
_SYS_FS_ZFS_H boolean_t zpool_prop_vdev(const char *name);
|
|
|
|
_SYS_FS_ZFS_H uint64_t vdev_prop_random_value(vdev_prop_t prop, uint64_t seed);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Definitions for the Delegation.
|
|
|
|
*/
|
|
|
|
/*
 * Delegation "who" types.  Apart from ZFS_DELEG_WHO_UNKNOWN (0), every
 * value is an ASCII letter: a lowercase letter for the base type and the
 * matching uppercase letter for its *_SETS counterpart.
 */
typedef enum {
|
|
|
|
ZFS_DELEG_WHO_UNKNOWN = 0,
|
|
|
|
ZFS_DELEG_USER = 'u',
|
|
|
|
ZFS_DELEG_USER_SETS = 'U',
|
|
|
|
ZFS_DELEG_GROUP = 'g',
|
|
|
|
ZFS_DELEG_GROUP_SETS = 'G',
|
|
|
|
ZFS_DELEG_EVERYONE = 'e',
|
|
|
|
ZFS_DELEG_EVERYONE_SETS = 'E',
|
|
|
|
ZFS_DELEG_CREATE = 'c',
|
|
|
|
ZFS_DELEG_CREATE_SETS = 'C',
|
|
|
|
ZFS_DELEG_NAMED_SET = 's',
|
|
|
|
ZFS_DELEG_NAMED_SET_SETS = 'S'
|
|
|
|
} zfs_deleg_who_type_t;
|
|
|
|
|
|
|
|
/*
 * Delegation permission inheritance kinds, explicitly numbered 0-4.
 * ZFS_DELEG_PERM_LOCALDESCENDENT (3) equals
 * ZFS_DELEG_PERM_LOCAL (1) | ZFS_DELEG_PERM_DESCENDENT (2).
 * NOTE(review): this looks like a deliberate bit combination, but nothing
 * in this header says so -- confirm before testing individual bits.
 */
typedef enum {
|
|
|
|
ZFS_DELEG_NONE = 0,
|
|
|
|
ZFS_DELEG_PERM_LOCAL = 1,
|
|
|
|
ZFS_DELEG_PERM_DESCENDENT = 2,
|
|
|
|
ZFS_DELEG_PERM_LOCALDESCENDENT = 3,
|
|
|
|
ZFS_DELEG_PERM_CREATE = 4
|
|
|
|
} zfs_deleg_inherit_t;
|
|
|
|
|
|
|
|
#define ZFS_DELEG_PERM_UID "uid"
|
|
|
|
#define ZFS_DELEG_PERM_GID "gid"
|
|
|
|
#define ZFS_DELEG_PERM_GROUPS "groups"
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZFS_MLSLABEL_DEFAULT "none"
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/* String keys naming the source and target of an SMB ACL operation. */
#define	ZFS_SMB_ACL_SRC		"src"
#define	ZFS_SMB_ACL_TARGET	"target"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/* Index values for "canmount" (off / on / noauto). */
typedef enum {
	ZFS_CANMOUNT_OFF = 0,
	ZFS_CANMOUNT_ON = 1,
	ZFS_CANMOUNT_NOAUTO = 2
} zfs_canmount_type_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/* Index values for "logbias" (latency / throughput). */
typedef enum {
	ZFS_LOGBIAS_LATENCY = 0,
	ZFS_LOGBIAS_THROUGHPUT = 1
} zfs_logbias_op_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/* Share/unshare operation codes, one pair each for NFS and SMB. */
typedef enum zfs_share_op {
	ZFS_SHARE_NFS = 0,
	ZFS_UNSHARE_NFS = 1,
	ZFS_SHARE_SMB = 2,
	ZFS_UNSHARE_SMB = 3
} zfs_share_op_t;
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
 * SMB ACL operation codes.  The values are implicit, so they number
 * consecutively from 0 in declaration order.
 */
typedef enum zfs_smb_acl_op {
	ZFS_SMB_ACL_ADD,
	ZFS_SMB_ACL_REMOVE,
	ZFS_SMB_ACL_RENAME,
	ZFS_SMB_ACL_PURGE
} zfs_smb_acl_op_t;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* Cache policy index values (none / metadata only / all). */
typedef enum zfs_cache_type {
	ZFS_CACHE_NONE = 0,
	ZFS_CACHE_METADATA = 1,
	ZFS_CACHE_ALL = 2
} zfs_cache_type_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/* Index values for "sync" (standard / always / disabled). */
typedef enum {
	ZFS_SYNC_STANDARD = 0,
	ZFS_SYNC_ALWAYS = 1,
	ZFS_SYNC_DISABLED = 2
} zfs_sync_type_t;
|
|
|
|
|
2011-10-25 03:55:20 +04:00
|
|
|
/*
 * Index values for "xattr": off, directory-based storage (DIR), or
 * system-attribute based storage (SA).
 */
typedef enum {
	ZFS_XATTR_OFF = 0,
	ZFS_XATTR_DIR = 1,
	ZFS_XATTR_SA = 2
} zfs_xattr_type_t;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
/*
 * Index values for the "dnodesize" dataset property.  LEGACY and AUTO
 * are symbolic settings; the remaining entries are literal dnode sizes
 * in bytes (powers of two from 1k to 16k).
 */
typedef enum {
	ZFS_DNSIZE_LEGACY = 0,
	ZFS_DNSIZE_AUTO = 1,
	ZFS_DNSIZE_1K = 1024,
	ZFS_DNSIZE_2K = 2048,
	ZFS_DNSIZE_4K = 4096,
	ZFS_DNSIZE_8K = 8192,
	ZFS_DNSIZE_16K = 16384
} zfs_dnsize_type_t;
|
|
|
|
|
2014-05-23 20:21:07 +04:00
|
|
|
/*
 * Redundant-metadata levels.  The values are implicit, so they number
 * consecutively from 0 (ALL) to 3 (NONE) in declaration order.
 */
typedef enum {
	ZFS_REDUNDANT_METADATA_ALL,
	ZFS_REDUNDANT_METADATA_MOST,
	ZFS_REDUNDANT_METADATA_SOME,
	ZFS_REDUNDANT_METADATA_NONE
} zfs_redundant_metadata_type_t;
|
|
|
|
|
2017-07-12 23:05:37 +03:00
|
|
|
/* Volume mode index values (default / geom / dev / none). */
typedef enum {
	ZFS_VOLMODE_DEFAULT = 0,
	ZFS_VOLMODE_GEOM = 1,
	ZFS_VOLMODE_DEV = 2,
	ZFS_VOLMODE_NONE = 3
} zfs_volmode_t;
|
|
|
|
|
Adding Direct IO Support
Adding O_DIRECT support to ZFS to bypass the ARC for writes/reads.
O_DIRECT support in ZFS will always ensure there is coherency between
buffered and O_DIRECT IO requests. This ensures that all IO requests,
whether buffered or direct, will see the same file contents at all
times. Just as in other filesystems, O_DIRECT does not imply O_SYNC. While
data is written directly to VDEV disks, metadata will not be synced
until the associated TXG is synced.
For both O_DIRECT read and write request the offset and request sizes,
at a minimum, must be PAGE_SIZE aligned. In the event they are not,
then EINVAL is returned unless the direct property is set to always (see
below).
For O_DIRECT writes:
The request also must be block aligned (recordsize) or the write
request will take the normal (buffered) write path. In the event that
request is block aligned and a cached copy of the buffer is in the ARC,
then it will be discarded from the ARC forcing all further reads to
retrieve the data from disk.
For O_DIRECT reads:
The only alignment restrictions are PAGE_SIZE alignment. In the event
that the requested data is buffered (in the ARC) it will just be
copied from the ARC into the user buffer.
For both O_DIRECT writes and reads the O_DIRECT flag will be ignored in
the event that file contents are mmap'ed. In this case, all requests
that are at least PAGE_SIZE aligned will just fall back to the buffered
paths. If the request however is not PAGE_SIZE aligned, EINVAL will
be returned as always regardless if the file's contents are mmap'ed.
Since O_DIRECT writes go through the normal ZIO pipeline, the
following operations are supported just as with normal buffered writes:
Checksum
Compression
Encryption
Erasure Coding
There is one caveat for the data integrity of O_DIRECT writes that is
distinct for each of the OS's supported by ZFS.
FreeBSD - FreeBSD is able to place user pages under write protection so
any data in the user buffers and written directly down to the
VDEV disks is guaranteed to not change. There is no concern
with data integrity and O_DIRECT writes.
Linux - Linux is not able to place anonymous user pages under write
protection. Because of this, if the user decides to manipulate
the page contents while the write operation is occurring, data
integrity can not be guaranteed. However, there is a module
parameter `zfs_vdev_direct_write_verify` that controls whether
a checksum verify is run on O_DIRECT writes to a top-level VDEV
before the contents of the I/O buffer are committed to disk.
In the event of a checksum verification
failure the write will return EIO. The number of O_DIRECT write
checksum verification errors can be observed by doing
`zpool status -d`, which will list all verification errors that
have occurred on a top-level VDEV. Along with `zpool status`, a
ZED event will be issued as `dio_verify` when a checksum
verification error occurs.
ZVOLs and dedup are not currently supported with Direct I/O.
A new dataset property `direct` has been added with the following 3
allowable values:
disabled - Accepts O_DIRECT flag, but silently ignores it and treats
the request as a buffered IO request.
standard - Follows the alignment restrictions outlined above for
write/read IO requests when the O_DIRECT flag is used.
always - Treats every write/read IO request as though it passed
O_DIRECT and will do O_DIRECT if the alignment restrictions
are met otherwise will redirect through the ARC. This
property will not allow a request to fail.
There is also a module parameter zfs_dio_enabled that can be used to
force all reads and writes through the ARC. By setting this module
parameter to 0, it mimics as if the direct dataset property is set to
disabled.
Reviewed-by: Brian Behlendorf <behlendorf@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
Co-authored-by: Mark Maybee <mark.maybee@delphix.com>
Co-authored-by: Matt Macy <mmacy@FreeBSD.org>
Co-authored-by: Brian Behlendorf <behlendorf@llnl.gov>
Closes #10018
2024-09-14 23:47:59 +03:00
|
|
|
/*
 * Index values for the "direct" dataset property.  STANDARD and ALWAYS
 * carry no explicit value, so they are 1 and 2 respectively.
 */
typedef enum {
	ZFS_DIRECT_DISABLED = 0,
	ZFS_DIRECT_STANDARD,
	ZFS_DIRECT_ALWAYS
} zfs_direct_t;
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
/*
 * Encryption key status.  Entries after NONE are implicit, so
 * UNAVAILABLE is 1 and AVAILABLE is 2.
 */
typedef enum zfs_keystatus {
	ZFS_KEYSTATUS_NONE = 0,
	ZFS_KEYSTATUS_UNAVAILABLE,
	ZFS_KEYSTATUS_AVAILABLE
} zfs_keystatus_t;
|
|
|
|
|
|
|
|
/*
 * Encryption key formats.  ZFS_KEYFORMAT_FORMATS is the final entry
 * and evaluates to 4, the number of entries declared before it.
 */
typedef enum zfs_keyformat {
	ZFS_KEYFORMAT_NONE = 0,
	ZFS_KEYFORMAT_RAW,
	ZFS_KEYFORMAT_HEX,
	ZFS_KEYFORMAT_PASSPHRASE,
	ZFS_KEYFORMAT_FORMATS
} zfs_keyformat_t;
|
|
|
|
|
|
|
|
/*
 * Encryption key locations.  ZFS_KEYLOCATION_LOCATIONS is the final
 * entry and evaluates to 3, the number of entries declared before it.
 */
typedef enum zfs_key_location {
	ZFS_KEYLOCATION_NONE = 0,
	ZFS_KEYLOCATION_PROMPT,
	ZFS_KEYLOCATION_URI,
	ZFS_KEYLOCATION_LOCATIONS
} zfs_keylocation_t;
|
|
|
|
|
2023-10-24 21:00:07 +03:00
|
|
|
/* Prefetch policy index values (none / metadata only / all). */
typedef enum {
	ZFS_PREFETCH_NONE = 0,
	ZFS_PREFETCH_METADATA = 1,
	ZFS_PREFETCH_ALL = 2
} zfs_prefetch_type_t;
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
/* Default and minimum PBKDF2 iteration counts. */
#define	DEFAULT_PBKDF2_ITERATIONS	350000
#define	MIN_PBKDF2_ITERATIONS		100000
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * On-disk version number.
 */
#define	SPA_VERSION_1			1ULL
#define	SPA_VERSION_2			2ULL
#define	SPA_VERSION_3			3ULL
#define	SPA_VERSION_4			4ULL
#define	SPA_VERSION_5			5ULL
#define	SPA_VERSION_6			6ULL
#define	SPA_VERSION_7			7ULL
#define	SPA_VERSION_8			8ULL
#define	SPA_VERSION_9			9ULL
#define	SPA_VERSION_10			10ULL
#define	SPA_VERSION_11			11ULL
#define	SPA_VERSION_12			12ULL
#define	SPA_VERSION_13			13ULL
#define	SPA_VERSION_14			14ULL
#define	SPA_VERSION_15			15ULL
#define	SPA_VERSION_16			16ULL
#define	SPA_VERSION_17			17ULL
#define	SPA_VERSION_18			18ULL
#define	SPA_VERSION_19			19ULL
#define	SPA_VERSION_20			20ULL
#define	SPA_VERSION_21			21ULL
#define	SPA_VERSION_22			22ULL
#define	SPA_VERSION_23			23ULL
#define	SPA_VERSION_24			24ULL
#define	SPA_VERSION_25			25ULL
#define	SPA_VERSION_26			26ULL
#define	SPA_VERSION_27			27ULL
#define	SPA_VERSION_28			28ULL
#define	SPA_VERSION_5000		5000ULL

/*
 * The incrementing pool version number has been replaced by pool feature
 * flags.  For more details, see zfeature.c.
 */
#define	SPA_VERSION			SPA_VERSION_5000
#define	SPA_VERSION_STRING		"5000"

/*
 * Symbolic names for the changes that caused a SPA_VERSION switch.
 * Used in the code when checking for presence or absence of a feature.
 * Feel free to define multiple symbolic names for each version if there
 * were multiple changes to on-disk structures during that version.
 *
 * NOTE: When checking the current SPA_VERSION in your code, be sure
 *       to use spa_version() since it reports the version of the
 *       last synced uberblock.  Checking the in-flight version can
 *       be dangerous in some cases.
 */
#define	SPA_VERSION_INITIAL		SPA_VERSION_1
#define	SPA_VERSION_DITTO_BLOCKS	SPA_VERSION_2
#define	SPA_VERSION_SPARES		SPA_VERSION_3
#define	SPA_VERSION_RAIDZ2		SPA_VERSION_3
#define	SPA_VERSION_BPOBJ_ACCOUNT	SPA_VERSION_3
#define	SPA_VERSION_RAIDZ_DEFLATE	SPA_VERSION_3
#define	SPA_VERSION_DNODE_BYTES		SPA_VERSION_3
#define	SPA_VERSION_ZPOOL_HISTORY	SPA_VERSION_4
#define	SPA_VERSION_GZIP_COMPRESSION	SPA_VERSION_5
#define	SPA_VERSION_BOOTFS		SPA_VERSION_6
#define	SPA_VERSION_SLOGS		SPA_VERSION_7
#define	SPA_VERSION_DELEGATED_PERMS	SPA_VERSION_8
#define	SPA_VERSION_FUID		SPA_VERSION_9
#define	SPA_VERSION_REFRESERVATION	SPA_VERSION_9
#define	SPA_VERSION_REFQUOTA		SPA_VERSION_9
#define	SPA_VERSION_UNIQUE_ACCURATE	SPA_VERSION_9
#define	SPA_VERSION_L2CACHE		SPA_VERSION_10
#define	SPA_VERSION_NEXT_CLONES		SPA_VERSION_11
#define	SPA_VERSION_ORIGIN		SPA_VERSION_11
#define	SPA_VERSION_DSL_SCRUB		SPA_VERSION_11
#define	SPA_VERSION_SNAP_PROPS		SPA_VERSION_12
#define	SPA_VERSION_USED_BREAKDOWN	SPA_VERSION_13
#define	SPA_VERSION_PASSTHROUGH_X	SPA_VERSION_14
#define	SPA_VERSION_USERSPACE		SPA_VERSION_15
#define	SPA_VERSION_STMF_PROP		SPA_VERSION_16
#define	SPA_VERSION_RAIDZ3		SPA_VERSION_17
#define	SPA_VERSION_USERREFS		SPA_VERSION_18
#define	SPA_VERSION_HOLES		SPA_VERSION_19
#define	SPA_VERSION_ZLE_COMPRESSION	SPA_VERSION_20
#define	SPA_VERSION_DEDUP		SPA_VERSION_21
#define	SPA_VERSION_RECVD_PROPS		SPA_VERSION_22
#define	SPA_VERSION_SLIM_ZIL		SPA_VERSION_23
#define	SPA_VERSION_SA			SPA_VERSION_24
#define	SPA_VERSION_SCAN		SPA_VERSION_25
#define	SPA_VERSION_DIR_CLONES		SPA_VERSION_26
#define	SPA_VERSION_DEADLISTS		SPA_VERSION_26
#define	SPA_VERSION_FAST_SNAP		SPA_VERSION_27
#define	SPA_VERSION_MULTI_REPLACE	SPA_VERSION_28
#define	SPA_VERSION_BEFORE_FEATURES	SPA_VERSION_28
#define	SPA_VERSION_FEATURES		SPA_VERSION_5000

/*
 * True when (v) lies in one of the two supported ranges: the legacy
 * numbered versions [SPA_VERSION_INITIAL, SPA_VERSION_BEFORE_FEATURES]
 * or the feature-flag range [SPA_VERSION_FEATURES, SPA_VERSION].
 * Versions between 28 and 5000 are rejected.  The argument is
 * evaluated more than once, so it must not have side effects.
 */
#define	SPA_VERSION_IS_SUPPORTED(v) \
	(((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
	((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
 * ZPL version - rev'd whenever an incompatible on-disk format change
 * occurs.  This is independent of SPA/DMU/ZAP versioning.  You must
 * also update the version_table[] and help message in zfs_prop.c.
 */
#define	ZPL_VERSION_1			1ULL
#define	ZPL_VERSION_2			2ULL
#define	ZPL_VERSION_3			3ULL
#define	ZPL_VERSION_4			4ULL
#define	ZPL_VERSION_5			5ULL
#define	ZPL_VERSION			ZPL_VERSION_5
#define	ZPL_VERSION_STRING		"5"

/* Symbolic names for the change introduced by each ZPL version. */
#define	ZPL_VERSION_INITIAL		ZPL_VERSION_1
#define	ZPL_VERSION_DIRENT_TYPE		ZPL_VERSION_2
#define	ZPL_VERSION_FUID		ZPL_VERSION_3
#define	ZPL_VERSION_NORMALIZATION	ZPL_VERSION_3
#define	ZPL_VERSION_SYSATTR		ZPL_VERSION_3
#define	ZPL_VERSION_USERSPACE		ZPL_VERSION_4
#define	ZPL_VERSION_SA			ZPL_VERSION_5
|
|
|
|
|
2020-04-10 20:33:35 +03:00
|
|
|
/* Persistent L2ARC version */
#define	L2ARC_PERSISTENT_VERSION_1	1ULL
#define	L2ARC_PERSISTENT_VERSION	L2ARC_PERSISTENT_VERSION_1
#define	L2ARC_PERSISTENT_VERSION_STRING	"1"
|
|
|
|
|
2017-02-11 01:51:09 +03:00
|
|
|
/*
 * Rewind policy information.
 *
 * Each policy is a distinct bit.  The two masks are derived from the
 * named bits rather than spelled as bare numbers, so their values stay
 * 28 (TRY | DO | EXTREME) and 31 (all five bits).
 */
#define	ZPOOL_NO_REWIND		1  /* No policy - default behavior */
#define	ZPOOL_NEVER_REWIND	2  /* Do not search for best txg or rewind */
#define	ZPOOL_TRY_REWIND	4  /* Search for best txg, but do not rewind */
#define	ZPOOL_DO_REWIND		8  /* Rewind to best txg w/in deferred frees */
#define	ZPOOL_EXTREME_REWIND	16 /* Allow extreme measures to find best txg */
/* All the possible rewind bits */
#define	ZPOOL_REWIND_MASK \
	(ZPOOL_TRY_REWIND | ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND)
/* All the possible policy bits */
#define	ZPOOL_REWIND_POLICIES \
	(ZPOOL_NO_REWIND | ZPOOL_NEVER_REWIND | ZPOOL_REWIND_MASK)
|
|
|
|
|
2017-02-11 01:51:09 +03:00
|
|
|
/*
 * Pool load policy: the requested rewind policy together with the
 * error thresholds and the txg to load.
 */
typedef struct zpool_load_policy {
	uint32_t	zlp_rewind;	/* rewind policy requested */
	uint64_t	zlp_maxmeta;	/* max acceptable meta-data errors */
	uint64_t	zlp_maxdata;	/* max acceptable data errors */
	uint64_t	zlp_txg;	/* specific txg to load */
} zpool_load_policy_t;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The following are configuration names used in the nvlist describing a pool's
|
2020-02-18 20:36:50 +03:00
|
|
|
* configuration. New on-disk names should be prefixed with "<reversed-DNS>:"
|
|
|
|
* (e.g. "org.openzfs:") to avoid conflicting names being developed
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
* independently.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
#define ZPOOL_CONFIG_VERSION "version"
|
|
|
|
#define ZPOOL_CONFIG_POOL_NAME "name"
|
|
|
|
#define ZPOOL_CONFIG_POOL_STATE "state"
|
|
|
|
#define ZPOOL_CONFIG_POOL_TXG "txg"
|
|
|
|
#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
|
|
|
|
#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
|
|
|
|
#define ZPOOL_CONFIG_TOP_GUID "top_guid"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
|
|
|
|
#define ZPOOL_CONFIG_TYPE "type"
|
|
|
|
#define ZPOOL_CONFIG_CHILDREN "children"
|
|
|
|
#define ZPOOL_CONFIG_ID "id"
|
|
|
|
#define ZPOOL_CONFIG_GUID "guid"
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object"
|
|
|
|
#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births"
|
|
|
|
#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPOOL_CONFIG_PATH "path"
|
|
|
|
#define ZPOOL_CONFIG_DEVID "devid"
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
#define ZPOOL_CONFIG_SPARE_ID "spareid"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
|
|
|
|
#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
|
|
|
|
#define ZPOOL_CONFIG_ASHIFT "ashift"
|
|
|
|
#define ZPOOL_CONFIG_ASIZE "asize"
|
|
|
|
#define ZPOOL_CONFIG_DTL "DTL"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */
|
2016-12-17 01:11:29 +03:00
|
|
|
#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */
|
RAID-Z expansion feature
This feature allows disks to be added one at a time to a RAID-Z group,
expanding its capacity incrementally. This feature is especially useful
for small pools (typically with only one RAID-Z group), where there
isn't sufficient hardware to add capacity by adding a whole new RAID-Z
group (typically doubling the number of disks).
== Initiating expansion ==
A new device (disk) can be attached to an existing RAIDZ vdev, by
running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank
raidz2-0 sda`. The new device will become part of the RAIDZ group. A
"raidz expansion" will be initiated, and the new device will contribute
additional space to the RAIDZ group once the expansion completes.
The `feature@raidz_expansion` on-disk feature flag must be `enabled` to
initiate an expansion, and it remains `active` for the life of the pool.
In other words, pools with expanded RAIDZ vdevs can not be imported by
older releases of the ZFS software.
== During expansion ==
The expansion entails reading all allocated space from existing disks in
the RAIDZ group, and rewriting it to the new disks in the RAIDZ group
(including the newly added device).
The expansion progress can be monitored with `zpool status`.
Data redundancy is maintained during (and after) the expansion. If a
disk fails while the expansion is in progress, the expansion pauses
until the health of the RAIDZ vdev is restored (e.g. by replacing the
failed disk and waiting for reconstruction to complete).
The pool remains accessible during expansion. Following a reboot or
export/import, the expansion resumes where it left off.
== After expansion ==
When the expansion completes, the additional space is available for use,
and is reflected in the `available` zfs property (as seen in `zfs list`,
`df`, etc).
Expansion does not change the number of failures that can be tolerated
without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after
expansion).
A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old
data-to-parity ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but
distributed among the larger set of disks. New blocks will be written
with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been
expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ
vdev's "assumed parity ratio" does not change, so slightly less space
than is expected may be reported for newly-written blocks, according to
`zfs list`, `df`, `ls -s`, and similar tools.
Sponsored-by: The FreeBSD Foundation
Sponsored-by: iXsystems, Inc.
Sponsored-by: vStack
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Authored-by: Matthew Ahrens <mahrens@delphix.com>
Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com>
Contributions-by: Stuart Maybee <stuart.maybee@comcast.net>
Contributions-by: Thorsten Behrens <tbehrens@outlook.com>
Contributions-by: Fmstrat <nospam@nowsci.com>
Contributions-by: Don Brady <dev.fs.zfs@gmail.com>
Signed-off-by: Don Brady <dev.fs.zfs@gmail.com>
Closes #15022
2023-11-08 21:19:41 +03:00
|
|
|
#define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
/* container nvlist of extended stats */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex"
|
|
|
|
|
|
|
|
/* Active queue read/write stats */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue"
|
|
|
|
/*
 * NOTE(review): the key string carries an "async_" prefix that the macro
 * name does not; presumably kept for compatibility with existing
 * consumers of this key -- confirm before changing.
 */
#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue"
|
2019-03-29 19:13:20 +03:00
|
|
|
/*
 * NOTE(review): the key string carries an "async_" prefix that the macro
 * name does not; presumably kept for compatibility with existing
 * consumers of this key -- confirm before changing.
 */
#define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue"
|
2021-08-26 21:26:49 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE "vdev_rebuild_active_queue"
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
/* Queue sizes */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue"
|
|
|
|
/*
 * NOTE(review): the key string carries an "async_" prefix that the macro
 * name does not; presumably kept for compatibility with existing
 * consumers of this key -- confirm before changing.
 */
#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue"
|
2019-03-29 19:13:20 +03:00
|
|
|
/*
 * NOTE(review): the key string carries an "async_" prefix that the macro
 * name does not; presumably kept for compatibility with existing
 * consumers of this key -- confirm before changing.
 */
#define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue"
|
2021-08-26 21:26:49 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE "vdev_rebuild_pend_queue"
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
/* Latency read/write histogram stats */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo"
|
|
|
|
/*
 * NOTE(review): unlike the other *_LAT_HISTO keys, this key string has no
 * "lat" component; presumably kept for compatibility -- confirm before
 * changing.
 */
#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo"
|
2019-03-29 19:13:20 +03:00
|
|
|
/*
 * NOTE(review): unlike the other *_LAT_HISTO keys, this key string has no
 * "lat" component; presumably kept for compatibility -- confirm before
 * changing.
 */
#define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo"
|
2021-08-26 21:26:49 +03:00
|
|
|
/*
 * NOTE(review): unlike the other *_LAT_HISTO keys, this key string has no
 * "lat" component; presumably kept for compatibility -- confirm before
 * changing.
 */
#define ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO "vdev_rebuild_histo"
|
2016-02-29 21:05:23 +03:00
|
|
|
|
2016-05-26 00:21:35 +03:00
|
|
|
/* Request size histograms */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo"
|
2019-03-29 19:13:20 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo"
|
2021-08-26 21:26:49 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO "vdev_ind_rebuild_histo"
|
2016-05-26 00:21:35 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
|
2019-03-29 19:13:20 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo"
|
2021-08-26 21:26:49 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO "vdev_agg_rebuild_histo"
|
2016-02-29 21:05:23 +03:00
|
|
|
|
2018-11-09 03:47:24 +03:00
|
|
|
/* Number of slow IOs */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"
|
|
|
|
|
Adding Direct IO Support
Adding O_DIRECT support to ZFS to bypass the ARC for writes/reads.
O_DIRECT support in ZFS will always ensure there is coherency between
buffered and O_DIRECT IO requests. This ensures that all IO requests,
whether buffered or direct, will see the same file contents at all
times. Just as in other FS's, O_DIRECT does not imply O_SYNC. While
data is written directly to VDEV disks, metadata will not be synced
until the associated TXG is synced.
For both O_DIRECT read and write requests, the offset and request sizes,
at a minimum, must be PAGE_SIZE aligned. In the event they are not,
then EINVAL is returned unless the direct property is set to always (see
below).
For O_DIRECT writes:
The request also must be block aligned (recordsize) or the write
request will take the normal (buffered) write path. In the event that the
request is block aligned and a cached copy of the buffer is in the ARC,
then it will be discarded from the ARC forcing all further reads to
retrieve the data from disk.
For O_DIRECT reads:
The only alignment restrictions are PAGE_SIZE alignment. In the event
that the requested data is buffered (in the ARC), it will just be
copied from the ARC into the user buffer.
For both O_DIRECT writes and reads the O_DIRECT flag will be ignored in
the event that file contents are mmap'ed. In this case, all requests
that are at least PAGE_SIZE aligned will just fall back to the buffered
paths. If the request however is not PAGE_SIZE aligned, EINVAL will
be returned as always, regardless of whether the file's contents are mmap'ed.
Since O_DIRECT writes go through the normal ZIO pipeline, the
following operations are supported just as with normal buffered writes:
Checksum
Compression
Encryption
Erasure Coding
There is one caveat for the data integrity of O_DIRECT writes that is
distinct for each of the OS's supported by ZFS.
FreeBSD - FreeBSD is able to place user pages under write protection so
any data in the user buffers and written directly down to the
VDEV disks is guaranteed to not change. There is no concern
with data integrity and O_DIRECT writes.
Linux - Linux is not able to place anonymous user pages under write
protection. Because of this, if the user decides to manipulate
the page contents while the write operation is occurring, data
integrity can not be guaranteed. However, there is a module
parameter `zfs_vdev_direct_write_verify` that controls whether
a checksum verify is run on O_DIRECT writes to a top-level VDEV
before the contents of the I/O buffer
are committed to disk. In the event of a checksum verification
failure the write will return EIO. The number of O_DIRECT write
checksum verification errors can be observed by doing
`zpool status -d`, which will list all verification errors that
have occurred on a top-level VDEV. Along with `zpool status`, a
ZED event will be issued as `dio_verify` when a checksum
verification error occurs.
ZVOLs and dedup are not currently supported with Direct I/O.
A new dataset property `direct` has been added with the following 3
allowable values:
disabled - Accepts O_DIRECT flag, but silently ignores it and treats
the request as a buffered IO request.
standard - Follows the alignment restrictions outlined above for
write/read IO requests when the O_DIRECT flag is used.
always - Treats every write/read IO request as though it passed
O_DIRECT and will do O_DIRECT if the alignment restrictions
are met otherwise will redirect through the ARC. This
property will not allow a request to fail.
There is also a module parameter zfs_dio_enabled that can be used to
force all reads and writes through the ARC. By setting this module
parameter to 0, it mimics as if the direct dataset property is set to
disabled.
Reviewed-by: Brian Behlendorf <behlendorf@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
Co-authored-by: Mark Maybee <mark.maybee@delphix.com>
Co-authored-by: Matt Macy <mmacy@FreeBSD.org>
Co-authored-by: Brian Behlendorf <behlendorf@llnl.gov>
Closes #10018
2024-09-14 23:47:59 +03:00
|
|
|
/* Number of Direct I/O write verify errors */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS "vdev_dio_verify_errors"
|
|
|
|
|
2016-10-24 20:45:59 +03:00
|
|
|
/* vdev enclosure sysfs path */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
|
|
|
|
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
|
|
|
|
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
|
|
|
|
#define ZPOOL_CONFIG_SPARES "spares"
|
|
|
|
#define ZPOOL_CONFIG_IS_SPARE "is_spare"
|
|
|
|
#define ZPOOL_CONFIG_NPARITY "nparity"
|
RAID-Z expansion feature
This feature allows disks to be added one at a time to a RAID-Z group,
expanding its capacity incrementally. This feature is especially useful
for small pools (typically with only one RAID-Z group), where there
isn't sufficient hardware to add capacity by adding a whole new RAID-Z
group (typically doubling the number of disks).
== Initiating expansion ==
A new device (disk) can be attached to an existing RAIDZ vdev, by
running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank
raidz2-0 sda`. The new device will become part of the RAIDZ group. A
"raidz expansion" will be initiated, and the new device will contribute
additional space to the RAIDZ group once the expansion completes.
The `feature@raidz_expansion` on-disk feature flag must be `enabled` to
initiate an expansion, and it remains `active` for the life of the pool.
In other words, pools with expanded RAIDZ vdevs can not be imported by
older releases of the ZFS software.
== During expansion ==
The expansion entails reading all allocated space from existing disks in
the RAIDZ group, and rewriting it to the new disks in the RAIDZ group
(including the newly added device).
The expansion progress can be monitored with `zpool status`.
Data redundancy is maintained during (and after) the expansion. If a
disk fails while the expansion is in progress, the expansion pauses
until the health of the RAIDZ vdev is restored (e.g. by replacing the
failed disk and waiting for reconstruction to complete).
The pool remains accessible during expansion. Following a reboot or
export/import, the expansion resumes where it left off.
== After expansion ==
When the expansion completes, the additional space is available for use,
and is reflected in the `available` zfs property (as seen in `zfs list`,
`df`, etc).
Expansion does not change the number of failures that can be tolerated
without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after
expansion).
A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old
data-to-parity ratio (e.g. a 5-wide RAIDZ2 has 3 data to 2 parity), but
distributed among the larger set of disks. New blocks will be written
with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been
expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ
vdev's "assumed parity ratio" does not change, so slightly less space
than is expected may be reported for newly-written blocks, according to
`zfs list`, `df`, `ls -s`, and similar tools.
Sponsored-by: The FreeBSD Foundation
Sponsored-by: iXsystems, Inc.
Sponsored-by: vStack
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Authored-by: Matthew Ahrens <mahrens@delphix.com>
Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com>
Contributions-by: Stuart Maybee <stuart.maybee@comcast.net>
Contributions-by: Thorsten Behrens <tbehrens@outlook.com>
Contributions-by: Fmstrat <nospam@nowsci.com>
Contributions-by: Don Brady <dev.fs.zfs@gmail.com>
Signed-off-by: Don Brady <dev.fs.zfs@gmail.com>
Closes #15022
2023-11-08 21:19:41 +03:00
|
|
|
/* NOTE(review): presumably set while a RAID-Z expansion is running; confirm */
#define ZPOOL_CONFIG_RAIDZ_EXPANDING "raidz_expanding"
|
|
|
|
#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPOOL_CONFIG_HOSTID "hostid"
|
|
|
|
#define ZPOOL_CONFIG_HOSTNAME "hostname"
|
2010-08-27 01:24:34 +04:00
|
|
|
#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPOOL_CONFIG_UNSPARE "unspare"
|
|
|
|
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
|
|
|
|
#define ZPOOL_CONFIG_IS_LOG "is_log"
|
|
|
|
#define ZPOOL_CONFIG_L2CACHE "l2cache"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
|
|
|
|
#define ZPOOL_CONFIG_IS_HOLE "is_hole"
|
|
|
|
#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram"
|
|
|
|
#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats"
|
|
|
|
#define ZPOOL_CONFIG_DDT_STATS "ddt_stats"
|
|
|
|
#define ZPOOL_CONFIG_SPLIT "splitcfg"
|
|
|
|
#define ZPOOL_CONFIG_ORIG_GUID "orig_guid"
|
|
|
|
#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
|
|
|
|
#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
|
2021-11-30 17:46:25 +03:00
|
|
|
#define ZPOOL_CONFIG_NONALLOCATING "non_allocating"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPOOL_CONFIG_REMOVING "removing"
|
2013-08-08 00:16:22 +04:00
|
|
|
#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
|
2020-07-03 21:05:50 +03:00
|
|
|
#define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg"
|
2011-11-15 23:01:27 +04:00
|
|
|
#define ZPOOL_CONFIG_COMMENT "comment"
|
2008-12-03 23:09:06 +03:00
|
|
|
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
|
2018-03-15 20:56:55 +03:00
|
|
|
#define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
|
2010-08-27 01:24:34 +04:00
|
|
|
#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */
|
2012-12-14 03:24:15 +04:00
|
|
|
#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */
|
2012-12-15 03:00:45 +04:00
|
|
|
#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */
|
2012-12-14 03:24:15 +04:00
|
|
|
#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
|
|
|
|
#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */
|
2014-02-21 07:57:17 +04:00
|
|
|
#define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */
|
2023-04-20 20:07:56 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_ROOT_ZAP "com.klarasystems:vdev_zap_root"
|
2016-04-11 23:16:57 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
|
|
|
|
#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
|
2018-10-19 07:06:18 +03:00
|
|
|
#define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer"
|
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debugability of the ZFS pool
load (and import) process. This includes:
7638 Refactor spa_load_impl into several functions
8961 SPA load/import should tell us why it failed
7277 zdb should be able to print zfs_dbgmsg's
To iterate on top of that, there are a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.
The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.
The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.
When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.
This new two-step pool load process now allows rewinding pools across
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.
With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
of data, this feature is for now restricted to a read-only import.
Porting notes (ZTS):
* Fix 'make dist' target in zpool_import
* The maximum path length allowed by tar is 99 characters. Several
of the new test cases exceeded this limit resulting in them not
being included in the tarball. Shorten the names slightly.
* Set/get tunables using accessor functions.
* Get last synced txg via the "zfs_txg_history" mechanism.
* Clear zinject handlers in cleanup for import_cache_device_replaced
and import_rewind_device_replaced in order that the zpool can be
exported if there is an error.
* Increase FILESIZE to 8G in zfs-test.sh to allow for a larger
ext4 file system to be created on ZFS_DISK2. Also, there's
no need to partition ZFS_DISK2 at all. The partitioning had
already been disabled for multipath devices. Among other things,
the partitioning steals some space from the ext4 file system,
makes it difficult to accurately calculate the parameters to
parted and can make some of the tests fail.
* Increase FS_SIZE and FILE_SIZE in the zpool_import test
configuration now that FILESIZE is larger.
* Write more data in order that device evacuation takes longer in
a couple of tests.
* Use mkdir -p to avoid errors when the directory already exists.
* Remove use of sudo in import_rewind_config_changed.
Authored by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9075
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123
Closes #7459
2016-07-22 17:39:36 +03:00
|
|
|
#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */
|
MMP interval and fail_intervals in uberblock
When Multihost is enabled, and a pool is imported, uberblock writes
include ub_mmp_delay to allow an importing node to calculate the
duration of an activity test. This value alone is not enough information.
If zfs_multihost_fail_intervals > 0 on the node with the pool imported,
the safe minimum duration of the activity test is well defined, but does
not depend on ub_mmp_delay:
zfs_multihost_fail_intervals * zfs_multihost_interval
and if zfs_multihost_fail_intervals == 0 on that node, there is no such
well defined safe duration, but the importing host cannot tell whether
mmp_delay is high due to I/O delays, or due to a very large
zfs_multihost_interval setting on the host which last imported the pool.
As a result, it may use a far longer period for the activity test than
is necessary.
This patch renames ub_mmp_sequence to ub_mmp_config and uses it to
record the zfs_multihost_interval and zfs_multihost_fail_intervals
values, as well as the mmp sequence. This allows a shorter activity
test duration to be calculated by the importing host in most situations.
These values are also added to the multihost_history kstat records.
It calculates the activity test duration differently depending on
whether the new fields are present or not; for importing pools with
only ub_mmp_delay, it uses
(zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals
Which results in an activity test duration less sensitive to the leaf
count.
In addition, it makes a few other improvements:
* It updates the "sequence" part of ub_mmp_config when MMP writes
in between syncs occur. This allows an importing host to detect MMP
on the remote host sooner, when the pool is idle, as it is not limited
to the granularity of ub_timestamp (1 second).
* It issues writes immediately when zfs_multihost_interval is changed
so remote hosts see the updated value as soon as possible.
* It fixes a bug where setting zfs_multihost_fail_intervals = 1 results
in immediate pool suspension.
* Update tests to verify activity check duration is based on recorded
tunable values, not tunable values on importing host.
* Update tests to verify the expected number of uberblocks have valid
MMP fields - fail_intervals, mmp_interval, mmp_seq (sequence number),
that sequence number is incrementing, and that uberblock values match
tunable settings.
Reviewed-by: Andreas Dilger <andreas.dilger@whamcloud.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #7842
2019-03-21 22:47:57 +03:00
|
|
|
#define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
|
2018-09-06 04:33:36 +03:00
|
|
|
#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */
|
2018-09-19 00:45:52 +03:00
|
|
|
#define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */
|
2020-07-03 21:05:50 +03:00
|
|
|
#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats"
|
2021-02-18 08:30:45 +03:00
|
|
|
#define ZPOOL_CONFIG_COMPATIBILITY "compatibility"
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* The persistent vdev state is stored as separate values rather than a single
|
|
|
|
* 'vdev_state' entry. This is because a device can be in multiple states, such
|
|
|
|
* as offline and degraded.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_CONFIG_OFFLINE "offline"
|
|
|
|
#define ZPOOL_CONFIG_FAULTED "faulted"
|
|
|
|
#define ZPOOL_CONFIG_DEGRADED "degraded"
|
|
|
|
#define ZPOOL_CONFIG_REMOVED "removed"
|
2009-07-03 02:44:48 +04:00
|
|
|
#define ZPOOL_CONFIG_FRU "fru"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPOOL_CONFIG_AUX_STATE "aux_state"
|
|
|
|
|
2017-02-11 01:51:09 +03:00
|
|
|
/* Pool load policy parameters */
|
|
|
|
#define ZPOOL_LOAD_POLICY "load-policy"
|
|
|
|
#define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy"
|
|
|
|
#define ZPOOL_LOAD_REQUEST_TXG "load-request-txg"
|
|
|
|
#define ZPOOL_LOAD_META_THRESH "load-meta-thresh"
|
|
|
|
#define ZPOOL_LOAD_DATA_THRESH "load-data-thresh"
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/* Rewind data discovered */
|
|
|
|
#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts"
|
2022-02-05 00:06:38 +03:00
|
|
|
#define ZPOOL_CONFIG_LOAD_META_ERRORS "verify_meta_errors"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors"
|
|
|
|
#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
/* dRAID configuration */
|
|
|
|
/* NOTE(review): presumably the data devices per dRAID group; confirm */
#define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata"
|
|
|
|
/* NOTE(review): presumably the number of distributed hot spares; confirm */
#define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares"
|
|
|
|
#define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#define VDEV_TYPE_ROOT "root"
|
|
|
|
#define VDEV_TYPE_MIRROR "mirror"
|
|
|
|
#define VDEV_TYPE_REPLACING "replacing"
|
|
|
|
#define VDEV_TYPE_RAIDZ "raidz"
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
/* dRAID (distributed parity RAID) vdev type */
#define VDEV_TYPE_DRAID "draid"
|
|
|
|
/* NOTE(review): presumably the dRAID distributed hot spare vdev type; confirm */
#define VDEV_TYPE_DRAID_SPARE "dspare"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define VDEV_TYPE_DISK "disk"
|
|
|
|
#define VDEV_TYPE_FILE "file"
|
|
|
|
#define VDEV_TYPE_MISSING "missing"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define VDEV_TYPE_HOLE "hole"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define VDEV_TYPE_SPARE "spare"
|
|
|
|
#define VDEV_TYPE_LOG "log"
|
|
|
|
#define VDEV_TYPE_L2CACHE "l2cache"
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
#define VDEV_TYPE_INDIRECT "indirect"
|
|
|
|
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
#define VDEV_RAIDZ_MAXPARITY 3
|
|
|
|
|
|
|
|
#define VDEV_DRAID_MAXPARITY 3
|
|
|
|
#define VDEV_DRAID_MIN_CHILDREN 2
|
|
|
|
#define VDEV_DRAID_MAX_CHILDREN UINT8_MAX /* 255, per <stdint.h> */
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */
|
|
|
|
#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \
|
|
|
|
"com.delphix:indirect_obsolete_sm"
|
|
|
|
#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \
|
|
|
|
"com.delphix:obsolete_counts_are_precise"
|
2016-12-17 01:11:29 +03:00
|
|
|
#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
|
|
|
|
"com.delphix:pool_checkpoint_sm"
|
Log Spacemap Project
= Motivation
At Delphix we've seen a lot of customer systems where fragmentation
is over 75% and random writes take a performance hit because a lot
of time is spent on I/Os that update on-disk space accounting metadata.
Specifically, we've seen cases where 20% to 40% of sync time is spent
after sync pass 1 and ~30% of the I/Os on the system are spent updating
spacemaps.
The problem is that these pools have existed long enough that we've
touched almost every metaslab at least once, and random writes
scatter frees across all metaslabs every TXG, thus appending to
their spacemaps and resulting in many I/Os. To give an example,
assuming that every VDEV has 200 metaslabs and our writes fit within
a single spacemap block (generally 4K) we have 200 I/Os. Then if we
assume 2 levels of indirection, we need 400 additional I/Os and
since we are talking about metadata for which we keep 2 extra copies
for redundancy we need to triple that number, leading to a total of
1800 I/Os per VDEV every TXG.
We could try and decrease the number of metaslabs so we have fewer
I/Os per TXG but then each metaslab would cover a wider range on
disk and thus would take more time to be loaded in memory from disk.
In addition, after it's loaded, its range tree would consume more
memory.
Another idea would be to just increase the spacemap block size
which would allow us to fit more entries within an I/O block
resulting in fewer I/Os per metaslab and a speedup in loading time.
The problem is still that we don't deal with the number of I/Os
going up as the number of metaslabs is increasing and the fact
is that we generally write a lot to a few metaslabs and a little
to the rest of them. Thus, just increasing the block size would
actually waste bandwidth because we won't be utilizing our bigger
block size.
= About this patch
This patch introduces the Log Spacemap project which provides the
solution to the above problem while taking into account all the
aforementioned tradeoffs. The details on how it achieves that can
be found in the references sections below and in the code (see
Big Theory Statement in spa_log_spacemap.c).
Even though the change is fairly constrained within the metaslab
and lower-level SPA codepaths, there is a side-change that is
user-facing. The change is that VDEV IDs from VDEV holes will no
longer be reused. To give some background and reasoning for this,
when a log device is removed and its VDEV structure was replaced
with a hole (or was compacted; if at the end of the vdev array),
its vdev_id could be reused by devices added after that. Now
with the pool-wide space maps recording the vdev ID, this behavior
can cause problems (e.g. is this entry referring to a segment in
the new vdev or the removed log?). Thus, to simplify things the
ID reuse behavior is gone and now vdev IDs for top-level vdevs
are truly unique within a pool.
= Testing
The illumos implementation of this feature has been used internally
for a year and has been in production for ~6 months. For this patch
specifically there don't seem to be any regressions introduced to
ZTS and I have been running zloop for a week without any related
problems.
= Performance Analysis (Linux Specific)
All performance results and analysis for illumos can be found in
the links of the references. Redoing the same experiments in Linux
gave similar results. Below are the specifics of the Linux run.
After the pool reached stable state the percentage of the time
spent in pass 1 per TXG was 64% on average for the stock bits
while the log spacemap bits stayed at 95% during the experiment
(graph: sdimitro.github.io/img/linux-lsm/PercOfSyncInPassOne.png).
Sync times per TXG were 37.6 seconds on average for the stock
bits and 22.7 seconds for the log spacemap bits (related graph:
sdimitro.github.io/img/linux-lsm/SyncTimePerTXG.png). As a result
the log spacemap bits were able to push more TXGs, which is also
the reason why all graphs quantified per TXG have more entries for
the log spacemap bits.
Another interesting aspect in terms of txg syncs is that the stock
bits had 22% of their TXGs reach sync pass 7, 55% reach sync pass 8,
and 20% reach 9. The log space map bits reached sync pass 4 in 79%
of their TXGs, sync pass 7 in 19%, and sync pass 8 at 1%. This
emphasizes the fact that not only do we spend less time on metadata
but we also iterate fewer times to convergence in spa_sync() dirtying
objects.
[related graphs:
stock- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGStock.png
lsm- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGLSM.png]
Finally, the improvement in IOPs that the userland gains from the
change is approximately 40%. There is a consistent win in IOPS as
you can see from the graphs below but the absolute amount of
improvement that the log spacemap gives varies within each minute
interval.
sdimitro.github.io/img/linux-lsm/StockVsLog3Days.png
sdimitro.github.io/img/linux-lsm/StockVsLog10Hours.png
= Porting to Other Platforms
For people that want to port this commit to other platforms below
is a list of ZoL commits that this patch depends on:
Make zdb results for checkpoint tests consistent
db587941c5ff6dea01932bb78f70db63cf7f38ba
Update vdev_is_spacemap_addressable() for new spacemap encoding
419ba5914552c6185afbe1dd17b3ed4b0d526547
Simplify spa_sync by breaking it up to smaller functions
8dc2197b7b1e4d7ebc1420ea30e51c6541f1d834
Factor metaslab_load_wait() in metaslab_load()
b194fab0fb6caad18711abccaff3c69ad8b3f6d3
Rename range_tree_verify to range_tree_verify_not_present
df72b8bebe0ebac0b20e0750984bad182cb6564a
Change target size of metaslabs from 256GB to 16GB
c853f382db731e15a87512f4ef1101d14d778a55
zdb -L should skip leak detection altogether
21e7cf5da89f55ce98ec1115726b150e19eefe89
vs_alloc can underflow in L2ARC vdevs
7558997d2f808368867ca7e5234e5793446e8f3f
Simplify log vdev removal code
6c926f426a26ffb6d7d8e563e33fc176164175cb
Get rid of space_map_update() for ms_synced_length
425d3237ee88abc53d8522a7139c926d278b4b7f
Introduce auxiliary metaslab histograms
928e8ad47d3478a3d5d01f0dd6ae74a9371af65e
Error path in metaslab_load_impl() forgets to drop ms_sync_lock
8eef997679ba54547f7d361553d21b3291f41ae7
= References
Background, Motivation, and Internals of the Feature
- OpenZFS 2017 Presentation:
youtu.be/jj2IxRkl5bQ
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemaps-project
Flushing Algorithm Internals & Performance Results
(Illumos Specific)
- Blogpost:
sdimitro.github.io/post/zfs-lsm-flushing/
- OpenZFS 2018 Presentation:
youtu.be/x6D2dHRjkxw
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemap-flushing-algorithm
Upstream Delphix Issues:
DLPX-51539, DLPX-59659, DLPX-57783, DLPX-61438, DLPX-41227, DLPX-59320
DLPX-63385
Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #8442
2019-07-16 20:11:49 +03:00
|
|
|
#define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \
|
|
|
|
"com.delphix:ms_unflushed_phys_txgs"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2020-07-03 21:05:50 +03:00
|
|
|
#define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \
|
|
|
|
"org.openzfs:vdev_rebuild"
|
|
|
|
|
2018-09-06 04:33:36 +03:00
|
|
|
#define VDEV_TOP_ZAP_ALLOCATION_BIAS \
|
|
|
|
"org.zfsonlinux:allocation_bias"
|
|
|
|
|
RAID-Z expansion feature
This feature allows disks to be added one at a time to a RAID-Z group,
expanding its capacity incrementally. This feature is especially useful
for small pools (typically with only one RAID-Z group), where there
isn't sufficient hardware to add capacity by adding a whole new RAID-Z
group (typically doubling the number of disks).
== Initiating expansion ==
A new device (disk) can be attached to an existing RAIDZ vdev, by
running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank
raidz2-0 sda`. The new device will become part of the RAIDZ group. A
"raidz expansion" will be initiated, and the new device will contribute
additional space to the RAIDZ group once the expansion completes.
The `feature@raidz_expansion` on-disk feature flag must be `enabled` to
initiate an expansion, and it remains `active` for the life of the pool.
In other words, pools with expanded RAIDZ vdevs can not be imported by
older releases of the ZFS software.
== During expansion ==
The expansion entails reading all allocated space from existing disks in
the RAIDZ group, and rewriting it to the new disks in the RAIDZ group
(including the newly added device).
The expansion progress can be monitored with `zpool status`.
Data redundancy is maintained during (and after) the expansion. If a
disk fails while the expansion is in progress, the expansion pauses
until the health of the RAIDZ vdev is restored (e.g. by replacing the
failed disk and waiting for reconstruction to complete).
The pool remains accessible during expansion. Following a reboot or
export/import, the expansion resumes where it left off.
== After expansion ==
When the expansion completes, the additional space is available for use,
and is reflected in the `available` zfs property (as seen in `zfs list`,
`df`, etc).
Expansion does not change the number of failures that can be tolerated
without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after
expansion).
A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old
data-to-parity ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but
distributed among the larger set of disks. New blocks will be written
with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been
expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ
vdev's "assumed parity ratio" does not change, so slightly less space
than is expected may be reported for newly-written blocks, according to
`zfs list`, `df`, `ls -s`, and similar tools.
Sponsored-by: The FreeBSD Foundation
Sponsored-by: iXsystems, Inc.
Sponsored-by: vStack
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Authored-by: Matthew Ahrens <mahrens@delphix.com>
Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com>
Contributions-by: Stuart Maybee <stuart.maybee@comcast.net>
Contributions-by: Thorsten Behrens <tbehrens@outlook.com>
Contributions-by: Fmstrat <nospam@nowsci.com>
Contributions-by: Don Brady <dev.fs.zfs@gmail.com>
Signed-off-by: Don Brady <dev.fs.zfs@gmail.com>
Closes #15022
2023-11-08 21:19:41 +03:00
|
|
|
#define VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE \
|
|
|
|
"org.openzfs:raidz_expand_state"
|
|
|
|
#define VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME \
|
|
|
|
"org.openzfs:raidz_expand_start_time"
|
|
|
|
#define VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME \
|
|
|
|
"org.openzfs:raidz_expand_end_time"
|
|
|
|
#define VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \
|
|
|
|
"org.openzfs:raidz_expand_bytes_copied"
|
|
|
|
|
2018-09-06 04:33:36 +03:00
|
|
|
/* vdev metaslab allocation bias */
|
|
|
|
#define VDEV_ALLOC_BIAS_LOG "log"
|
|
|
|
#define VDEV_ALLOC_BIAS_SPECIAL "special"
|
|
|
|
#define VDEV_ALLOC_BIAS_DEDUP "dedup"
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
/* vdev initialize state */
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
|
|
|
|
"com.delphix:next_offset_to_initialize"
|
|
|
|
#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
|
|
|
|
"com.delphix:vdev_initialize_state"
|
|
|
|
#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
|
|
|
|
"com.delphix:vdev_initialize_action_time"
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
/* vdev TRIM state */
|
|
|
|
#define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \
|
|
|
|
"org.zfsonlinux:next_offset_to_trim"
|
|
|
|
#define VDEV_LEAF_ZAP_TRIM_STATE \
|
|
|
|
"org.zfsonlinux:vdev_trim_state"
|
|
|
|
#define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \
|
|
|
|
"org.zfsonlinux:vdev_trim_action_time"
|
|
|
|
#define VDEV_LEAF_ZAP_TRIM_RATE \
|
|
|
|
"org.zfsonlinux:vdev_trim_rate"
|
|
|
|
#define VDEV_LEAF_ZAP_TRIM_PARTIAL \
|
|
|
|
"org.zfsonlinux:vdev_trim_partial"
|
|
|
|
#define VDEV_LEAF_ZAP_TRIM_SECURE \
|
|
|
|
"org.zfsonlinux:vdev_trim_secure"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* This is needed in userland to report the minimum necessary device size.
|
|
|
|
*/
|
|
|
|
#define SPA_MINDEVSIZE (64ULL << 20) /* 64 * 2^20; i.e. 64 MiB assuming the size is counted in bytes */
|
|
|
|
|
2014-07-20 00:19:24 +04:00
|
|
|
/*
|
|
|
|
* Set if the fragmentation has not yet been calculated. This can happen
|
|
|
|
* because the space maps have not been upgraded or the histogram feature
|
|
|
|
* is not enabled.
|
|
|
|
*/
|
|
|
|
#define ZFS_FRAG_INVALID UINT64_MAX
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* The location of the pool configuration repository, shared between kernel and
|
|
|
|
* userland.
|
|
|
|
*/
|
2020-08-18 00:43:47 +03:00
|
|
|
#define ZPOOL_CACHE_BOOT "/boot/zfs/zpool.cache"
|
2008-12-03 23:09:06 +03:00
|
|
|
#define ZPOOL_CACHE "/etc/zfs/zpool.cache"
|
2021-02-18 08:30:45 +03:00
|
|
|
/*
|
|
|
|
* Settings for zpool compatibility features files
|
|
|
|
*/
|
|
|
|
#define ZPOOL_SYSCONF_COMPAT_D SYSCONFDIR "/zfs/compatibility.d"
|
|
|
|
#define ZPOOL_DATA_COMPAT_D PKGDATADIR "/compatibility.d"
|
|
|
|
#define ZPOOL_COMPAT_MAXSIZE 16384
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Hard-wired compatibility settings
|
|
|
|
*/
|
|
|
|
#define ZPOOL_COMPAT_LEGACY "legacy"
|
|
|
|
#define ZPOOL_COMPAT_OFF "off"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* vdev states are ordered from least to most healthy.
|
|
|
|
* A vdev that's CANT_OPEN or below is considered unusable.
|
|
|
|
*/
|
|
|
|
typedef enum vdev_state { /* UNKNOWN is 0; each later state is one greater than the previous */
|
|
|
|
VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
|
|
|
|
VDEV_STATE_CLOSED, /* Not currently open */
|
|
|
|
VDEV_STATE_OFFLINE, /* Not allowed to open */
|
|
|
|
VDEV_STATE_REMOVED, /* Explicitly removed from system */
|
|
|
|
VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
|
|
|
|
VDEV_STATE_FAULTED, /* External request to fault device */
|
|
|
|
VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
|
|
|
|
VDEV_STATE_HEALTHY /* Presumed good */
|
|
|
|
} vdev_state_t;
|
|
|
|
|
|
|
|
#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* ONLINE is an alias for HEALTHY */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
|
|
|
|
* of the vdev stats structure uses these constants to distinguish why.
|
|
|
|
*/
|
|
|
|
typedef enum vdev_aux {
|
|
|
|
VDEV_AUX_NONE, /* no error */
|
|
|
|
VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
|
|
|
|
VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
|
|
|
|
VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
|
|
|
|
VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
|
|
|
|
VDEV_AUX_TOO_SMALL, /* vdev size is too small */
|
|
|
|
VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
|
|
|
|
VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
|
|
|
|
VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
|
2012-12-14 03:24:15 +04:00
|
|
|
VDEV_AUX_UNSUP_FEAT, /* unsupported features */
|
2008-11-20 23:01:55 +03:00
|
|
|
VDEV_AUX_SPARED, /* hot spare used in another pool */
|
2008-12-03 23:09:06 +03:00
|
|
|
VDEV_AUX_ERR_EXCEEDED, /* too many errors */
|
|
|
|
VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
|
2010-05-29 00:45:14 +04:00
|
|
|
VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
|
2017-05-19 22:30:16 +03:00
|
|
|
VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */
|
2017-03-29 03:21:11 +03:00
|
|
|
VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */
|
2017-05-19 22:30:16 +03:00
|
|
|
VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */
|
|
|
|
VDEV_AUX_ACTIVE, /* vdev active on a different host */
|
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debuggability of the ZFS pool
load (and import) process. This includes:
7638 Refactor spa_load_impl into several functions
8961 SPA load/import should tell us why it failed
7277 zdb should be able to print zfs_dbgmsg's
To iterate on top of that, there are a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.
The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.
The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.
When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.
This new two-step pool load process now allows rewinding pools across
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.
With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
of data, this feature is for now restricted to a read-only import.
Porting notes (ZTS):
* Fix 'make dist' target in zpool_import
* The maximum path length allowed by tar is 99 characters. Several
of the new test cases exceeded this limit resulting in them not
being included in the tarball. Shorten the names slightly.
* Set/get tunables using accessor functions.
* Get last synced txg via the "zfs_txg_history" mechanism.
* Clear zinject handlers in cleanup for import_cache_device_replaced
and import_rewind_device_replaced in order that the zpool can be
exported if there is an error.
* Increase FILESIZE to 8G in zfs-test.sh to allow for a larger
ext4 file system to be created on ZFS_DISK2. Also, there's
no need to partition ZFS_DISK2 at all. The partitioning had
already been disabled for multipath devices. Among other things,
the partitioning steals some space from the ext4 file system,
makes it difficult to accurately calculate the parameters to
parted and can make some of the tests fail.
* Increase FS_SIZE and FILE_SIZE in the zpool_import test
configuration now that FILESIZE is larger.
* Write more data in order that device evacuation takes longer in
a couple tests.
* Use mkdir -p to avoid errors when the directory already exists.
* Remove use of sudo in import_rewind_config_changed.
Authored by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9075
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123
Closes #7459
2016-07-22 17:39:36 +03:00
|
|
|
VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */
|
2020-08-21 22:53:17 +03:00
|
|
|
VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */
|
2008-11-20 23:01:55 +03:00
|
|
|
} vdev_aux_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pool state. The following states are written to disk as part of the normal
|
|
|
|
* SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining
|
|
|
|
* states are software abstractions used at various levels to communicate
|
|
|
|
* pool state.
|
|
|
|
*/
|
|
|
|
/*
 * The first five enumerators (ACTIVE through L2CACHE) are the states that
 * are written to disk; the last three are software-only abstractions.
 * NOTE(review): the on-disk states presumably must keep their numeric
 * values (0-4) -- confirm before reordering or inserting entries.
 */
typedef enum pool_state {
|
|
|
|
POOL_STATE_ACTIVE = 0, /* In active use */
|
|
|
|
POOL_STATE_EXPORTED, /* Explicitly exported */
|
|
|
|
POOL_STATE_DESTROYED, /* Explicitly destroyed */
|
|
|
|
POOL_STATE_SPARE, /* Reserved for hot spare use */
|
|
|
|
POOL_STATE_L2CACHE, /* Level 2 ARC device */
|
|
|
|
/* The states below are software abstractions and are not written to disk. */
POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
|
|
|
|
POOL_STATE_UNAVAIL, /* Internal libzfs state */
|
|
|
|
POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
|
|
|
|
} pool_state_t;
|
|
|
|
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
/*
|
|
|
|
* mmp state. The following states provide additional detail describing
|
|
|
|
* why a pool couldn't be safely imported.
|
|
|
|
*/
|
|
|
|
/*
 * Enumerators take the values 0, 1 and 2 in declaration order.
 * NOTE(review): presumably reported to userland as the outcome of the
 * multihost (MMP) activity check performed during import -- confirm.
 */
typedef enum mmp_state {
|
|
|
|
MMP_STATE_ACTIVE = 0, /* In active use */
|
|
|
|
MMP_STATE_INACTIVE, /* Inactive and safe to import */
|
|
|
|
MMP_STATE_NO_HOSTID /* System hostid is not set */
|
|
|
|
} mmp_state_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Scan Functions.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
 * Values of this enum are carried in the pss_func and pss_error_scrub_func
 * members of pool_scan_stat_t. Enumerators are numbered from 0 in
 * declaration order.
 */
typedef enum pool_scan_func {
|
|
|
|
POOL_SCAN_NONE, /* NOTE(review): presumably "no scan function" -- confirm */
|
|
|
|
POOL_SCAN_SCRUB,
|
|
|
|
POOL_SCAN_RESILVER,
|
2021-12-17 23:35:28 +03:00
|
|
|
POOL_SCAN_ERRORSCRUB,
|
2010-05-29 00:45:14 +04:00
|
|
|
POOL_SCAN_FUNCS /* evaluates to 4, the number of entries above, while it stays last */
|
|
|
|
} pool_scan_func_t;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-07-07 08:16:13 +03:00
|
|
|
/*
|
|
|
|
* Used to control scrub pause and resume.
|
|
|
|
*/
|
|
|
|
/* Scrub control commands; enumerators are numbered 0, 1, 2 in order. */
typedef enum pool_scrub_cmd {
|
|
|
|
POOL_SCRUB_NORMAL = 0, /* NOTE(review): presumably start/resume -- confirm */
|
|
|
|
POOL_SCRUB_PAUSE,
|
|
|
|
POOL_SCRUB_FLAGS_END /* sentinel: one past the last command value (2) */
|
|
|
|
} pool_scrub_cmd_t;
|
|
|
|
|
2016-12-17 01:11:29 +03:00
|
|
|
/*
 * Pool checkpoint states. Values of this type are carried in the pcs_state
 * member of pool_checkpoint_stat_t.
 */
typedef enum {
|
|
|
|
CS_NONE, /* NOTE(review): presumably "no checkpoint" -- confirm */
|
|
|
|
CS_CHECKPOINT_EXISTS,
|
|
|
|
CS_CHECKPOINT_DISCARDING,
|
|
|
|
CS_NUM_STATES /* evaluates to 3, the number of states above, while it stays last */
|
|
|
|
} checkpoint_state_t;
|
|
|
|
|
|
|
|
/*
 * Checkpoint statistics. Every member is a uint64_t.
 * NOTE(review): presumably for the same nvlist uint64-array transport that
 * the other *_stat structures in this file use -- confirm.
 */
typedef struct pool_checkpoint_stat {
|
|
|
|
uint64_t pcs_state; /* checkpoint_state_t */
|
|
|
|
uint64_t pcs_start_time; /* time checkpoint/discard started */
|
|
|
|
uint64_t pcs_space; /* checkpointed space */
|
|
|
|
} pool_checkpoint_stat_t;
|
2017-07-07 08:16:13 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ZIO types. Needed to interpret vdev statistics below.
|
|
|
|
*/
|
|
|
|
/*
 * Enumerators are numbered from 0 in declaration order, so ZIO_TYPES
 * evaluates to 7. Note that the vs_ops[] and vs_bytes[] arrays of
 * vdev_stat are sized by the separate constant VS_ZIO_TYPES (6), not by
 * ZIO_TYPES.
 */
typedef enum zio_type {
|
|
|
|
ZIO_TYPE_NULL = 0,
|
|
|
|
ZIO_TYPE_READ,
|
|
|
|
ZIO_TYPE_WRITE,
|
|
|
|
ZIO_TYPE_FREE,
|
|
|
|
ZIO_TYPE_CLAIM,
|
2024-04-04 14:35:00 +03:00
|
|
|
ZIO_TYPE_FLUSH, /* formerly named ZIO_TYPE_IOCTL */
|
2019-03-29 19:13:20 +03:00
|
|
|
ZIO_TYPE_TRIM,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZIO_TYPES /* number of types above (7) while it stays last */
|
|
|
|
} zio_type_t;
|
|
|
|
|
2024-04-04 14:35:00 +03:00
|
|
|
/*
|
|
|
|
* Compatibility: _IOCTL was renamed to _FLUSH; keep the old name available to
|
|
|
|
* user programs.
|
|
|
|
*/
|
|
|
|
#define ZIO_TYPE_IOCTL ZIO_TYPE_FLUSH
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Pool statistics. Note: all fields should be 64-bit because this
|
|
|
|
* is passed between kernel and userland as an nvlist uint64 array.
|
|
|
|
*/
|
|
|
|
/*
 * Scan progress counters. Members are grouped, in order, as: scan values
 * stored on disk, scan values not stored on disk, error scrub values
 * stored on disk, and error scrub values not stored on disk.
 * pss_func / pss_error_scrub_func carry pool_scan_func_t values and
 * pss_state / pss_error_scrub_state carry dsl_scan_state_t values.
 */
typedef struct pool_scan_stat {
|
|
|
|
/* values stored on disk */
|
|
|
|
uint64_t pss_func; /* pool_scan_func_t */
|
|
|
|
uint64_t pss_state; /* dsl_scan_state_t */
|
|
|
|
uint64_t pss_start_time; /* scan start time */
|
|
|
|
uint64_t pss_end_time; /* scan end time */
|
|
|
|
uint64_t pss_to_examine; /* total bytes to scan */
|
2017-11-16 04:27:01 +03:00
|
|
|
uint64_t pss_examined; /* total bytes located by scanner */
|
Do not report bytes skipped by scan as issued.
Scan process may skip blocks based on their birth time, DVA, etc.
Traditionally those blocks were accounted as issued, that caused
reporting of hugely over-inflated numbers, having nothing to do
with actual disk I/O. This change utilizes never used field in
struct dsl_scan_phys to account such skipped bytes, allowing to
report how much data were actually scrubbed/resilvered and what
is the actual I/O speed. While formally it is an on-disk format
change, it should be compatible both ways, so should not need a
feature flag.
This should partially address the same issue as c85ac731a0e, but
from a different perspective, complementing it.
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15007
2023-06-30 18:47:13 +03:00
|
|
|
uint64_t pss_skipped; /* total bytes skipped by scanner */
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t pss_processed; /* total processed bytes */
|
|
|
|
uint64_t pss_errors; /* scan errors */
|
|
|
|
|
|
|
|
/* values not stored on disk */
|
2017-11-16 04:27:01 +03:00
|
|
|
uint64_t pss_pass_exam; /* examined bytes per scan pass */
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t pss_pass_start; /* start time of a scan pass */
|
2019-08-30 19:53:15 +03:00
|
|
|
uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */
|
2017-07-07 08:16:13 +03:00
|
|
|
/* cumulative time scrub spent paused, needed for rate calculation */
|
|
|
|
uint64_t pss_pass_scrub_spent_paused;
|
2017-11-30 20:40:13 +03:00
|
|
|
uint64_t pss_pass_issued; /* issued bytes per scan pass */
|
2017-11-16 04:27:01 +03:00
|
|
|
uint64_t pss_issued; /* total bytes checked by scanner */
|
2021-12-17 23:35:28 +03:00
|
|
|
|
|
|
|
/* error scrub values stored on disk */
|
|
|
|
uint64_t pss_error_scrub_func; /* pool_scan_func_t */
|
|
|
|
uint64_t pss_error_scrub_state; /* dsl_scan_state_t */
|
|
|
|
uint64_t pss_error_scrub_start; /* error scrub start time */
|
|
|
|
uint64_t pss_error_scrub_end; /* error scrub end time */
|
|
|
|
uint64_t pss_error_scrub_examined; /* error blocks for which I/O was issued */
|
|
|
|
/* error blocks that still need I/O issued */
|
|
|
|
uint64_t pss_error_scrub_to_be_examined;
|
|
|
|
|
|
|
|
/* error scrub values not stored on disk */
|
|
|
|
/* error scrub pause time in milliseconds */
|
|
|
|
uint64_t pss_pass_error_scrub_pause;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
} pool_scan_stat_t;
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
/*
 * Device removal statistics. Every member is a uint64_t.
 */
typedef struct pool_removal_stat {
|
|
|
|
uint64_t prs_state; /* dsl_scan_state_t */
|
|
|
|
uint64_t prs_removing_vdev; /* NOTE(review): presumably id of the vdev being removed -- confirm */
|
|
|
|
uint64_t prs_start_time; /* NOTE(review): presumably removal start time -- confirm units */
|
|
|
|
uint64_t prs_end_time; /* NOTE(review): presumably removal end time -- confirm units */
|
|
|
|
uint64_t prs_to_copy; /* bytes that need to be copied */
|
|
|
|
uint64_t prs_copied; /* bytes copied so far */
|
|
|
|
/*
|
|
|
|
* bytes of memory used for indirect mappings.
|
|
|
|
* This includes all removed vdevs.
|
|
|
|
*/
|
|
|
|
uint64_t prs_mapping_memory;
|
|
|
|
} pool_removal_stat_t;
|
|
|
|
|
RAID-Z expansion feature
This feature allows disks to be added one at a time to a RAID-Z group,
expanding its capacity incrementally. This feature is especially useful
for small pools (typically with only one RAID-Z group), where there
isn't sufficient hardware to add capacity by adding a whole new RAID-Z
group (typically doubling the number of disks).
== Initiating expansion ==
A new device (disk) can be attached to an existing RAIDZ vdev, by
running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank
raidz2-0 sda`. The new device will become part of the RAIDZ group. A
"raidz expansion" will be initiated, and the new device will contribute
additional space to the RAIDZ group once the expansion completes.
The `feature@raidz_expansion` on-disk feature flag must be `enabled` to
initiate an expansion, and it remains `active` for the life of the pool.
In other words, pools with expanded RAIDZ vdevs can not be imported by
older releases of the ZFS software.
== During expansion ==
The expansion entails reading all allocated space from existing disks in
the RAIDZ group, and rewriting it to the new disks in the RAIDZ group
(including the newly added device).
The expansion progress can be monitored with `zpool status`.
Data redundancy is maintained during (and after) the expansion. If a
disk fails while the expansion is in progress, the expansion pauses
until the health of the RAIDZ vdev is restored (e.g. by replacing the
failed disk and waiting for reconstruction to complete).
The pool remains accessible during expansion. Following a reboot or
export/import, the expansion resumes where it left off.
== After expansion ==
When the expansion completes, the additional space is available for use,
and is reflected in the `available` zfs property (as seen in `zfs list`,
`df`, etc).
Expansion does not change the number of failures that can be tolerated
without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after
expansion).
A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old
data-to-parity ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but
distributed among the larger set of disks. New blocks will be written
with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been
expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ
vdev's "assumed parity ratio" does not change, so slightly less space
than is expected may be reported for newly-written blocks, according to
`zfs list`, `df`, `ls -s`, and similar tools.
Sponsored-by: The FreeBSD Foundation
Sponsored-by: iXsystems, Inc.
Sponsored-by: vStack
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Authored-by: Matthew Ahrens <mahrens@delphix.com>
Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com>
Contributions-by: Stuart Maybee <stuart.maybee@comcast.net>
Contributions-by: Thorsten Behrens <tbehrens@outlook.com>
Contributions-by: Fmstrat <nospam@nowsci.com>
Contributions-by: Don Brady <dev.fs.zfs@gmail.com>
Signed-off-by: Don Brady <dev.fs.zfs@gmail.com>
Closes #15022
2023-11-08 21:19:41 +03:00
|
|
|
/*
 * RAID-Z expansion statistics. Every member is a uint64_t.
 */
typedef struct pool_raidz_expand_stat {
|
|
|
|
uint64_t pres_state; /* dsl_scan_state_t */
|
|
|
|
uint64_t pres_expanding_vdev; /* NOTE(review): presumably id of the vdev being expanded -- confirm */
|
|
|
|
uint64_t pres_start_time; /* NOTE(review): presumably expansion start time -- confirm units */
|
|
|
|
uint64_t pres_end_time; /* NOTE(review): presumably expansion end time -- confirm units */
|
|
|
|
uint64_t pres_to_reflow; /* bytes that need to be moved */
|
|
|
|
uint64_t pres_reflowed; /* bytes moved so far */
|
|
|
|
uint64_t pres_waiting_for_resilver; /* NOTE(review): presumably a flag: expansion paused for a resilver -- confirm */
|
|
|
|
} pool_raidz_expand_stat_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
 * State values carried in pss_state and pss_error_scrub_state
 * (pool_scan_stat_t), prs_state (pool_removal_stat_t) and pres_state
 * (pool_raidz_expand_stat_t). Enumerators are numbered from 0 in
 * declaration order.
 */
typedef enum dsl_scan_state {
|
|
|
|
DSS_NONE,
|
|
|
|
DSS_SCANNING,
|
|
|
|
DSS_FINISHED,
|
|
|
|
DSS_CANCELED,
|
2021-12-17 23:35:28 +03:00
|
|
|
DSS_ERRORSCRUBBING,
|
2010-05-29 00:45:14 +04:00
|
|
|
DSS_NUM_STATES /* evaluates to 5, the number of states above, while it stays last */
|
|
|
|
} dsl_scan_state_t;
|
|
|
|
|
2020-07-03 21:05:50 +03:00
|
|
|
/*
 * Rebuild progress counters. The vrs_pass_* members cover only the period
 * since the most recent start/resume; the remaining counters are totals.
 */
typedef struct vdev_rebuild_stat {
|
|
|
|
uint64_t vrs_state; /* vdev_rebuild_state_t */
|
|
|
|
uint64_t vrs_start_time; /* time_t */
|
|
|
|
uint64_t vrs_end_time; /* time_t */
|
|
|
|
uint64_t vrs_scan_time_ms; /* total run time (millisecs) */
|
|
|
|
uint64_t vrs_bytes_scanned; /* allocated bytes scanned */
|
|
|
|
uint64_t vrs_bytes_issued; /* read bytes issued */
|
|
|
|
uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */
|
|
|
|
uint64_t vrs_bytes_est; /* total bytes to scan */
|
|
|
|
uint64_t vrs_errors; /* scanning errors */
|
|
|
|
uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */
|
|
|
|
uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */
|
|
|
|
uint64_t vrs_pass_bytes_issued; /* bytes issued since start/resume */
|
Do not report bytes skipped by scan as issued.
Scan process may skip blocks based on their birth time, DVA, etc.
Traditionally those blocks were accounted as issued, that caused
reporting of hugely over-inflated numbers, having nothing to do
with actual disk I/O. This change utilizes never used field in
struct dsl_scan_phys to account such skipped bytes, allowing to
report how much data were actually scrubbed/resilvered and what
is the actual I/O speed. While formally it is an on-disk format
change, it should be compatible both ways, so should not need a
feature flag.
This should partially address the same issue as c85ac731a0e, but
from a different perspective, complementing it.
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Akash B <akash-b@hpe.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15007
2023-06-30 18:47:13 +03:00
|
|
|
uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */
|
2020-07-03 21:05:50 +03:00
|
|
|
} vdev_rebuild_stat_t;
|
|
|
|
|
2014-02-21 07:57:17 +04:00
|
|
|
/*
|
2020-08-27 07:43:06 +03:00
|
|
|
* Errata described by https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-ER.
|
|
|
|
* The ordering of this enum must be maintained to ensure the errata identifiers
|
|
|
|
* map to the correct documentation. New errata may only be appended to the
|
|
|
|
* list and must contain corresponding documentation at the above link.
|
2014-02-21 07:57:17 +04:00
|
|
|
*/
|
|
|
|
typedef enum zpool_errata {
|
|
|
|
ZPOOL_ERRATA_NONE,
|
Add erratum for issue #2094
ZoL commit 1421c89 unintentionally changed the disk format in a forward-
compatible, but not backward compatible way. This was accomplished by
adding an entry to zbookmark_t, which is included in a couple of
on-disk structures. That led to the creation of pools with incorrect
dsl_scan_phys_t objects that could only be imported by versions of ZoL
containing that commit. Such pools cannot be imported by other versions
of ZFS or past versions of ZoL.
The additional field has been removed by the previous commit. However,
affected pools must be imported and scrubbed using a version of ZoL with
this commit applied. This will return the pools to a state in which they
may be imported by other implementations.
The 'zpool import' or 'zpool status' command can be used to determine if
a pool is impacted. A message similar to one of the following means your
pool must be scrubbed to restore compatibility.
$ zpool import
pool: zol-0.6.2-173
id: 1165955789558693437
state: ONLINE
status: Errata #1 detected.
action: The pool can be imported using its name or numeric identifier,
however there is a compatibility issue which should be corrected
by running 'zpool scrub'
see: http://zfsonlinux.org/msg/ZFS-8000-ER
config:
...
$ zpool status
pool: zol-0.6.2-173
state: ONLINE
scan: pool compatibility issue detected.
see: https://github.com/zfsonlinux/zfs/issues/2094
action: To correct the issue run 'zpool scrub'.
config:
...
If there was an async destroy in progress 'zpool import' will prevent
the pool from being imported. Further advice on how to proceed will be
provided by the error message as follows.
$ zpool import
pool: zol-0.6.2-173
id: 1165955789558693437
state: ONLINE
status: Errata #2 detected.
action: The pool can not be imported with this version of ZFS due to an
active asynchronous destroy. Revert to an earlier version and
allow the destroy to complete before updating.
see: http://zfsonlinux.org/msg/ZFS-8000-ER
config:
...
Pools affected by the damaged dsl_scan_phys_t can be detected prior to
an upgrade by running the following command as root:
zdb -dddd poolname 1 | grep -P '^\t\tscan = ' | sed -e 's;scan = ;;' | wc -w
Note that `poolname` must be replaced with the name of the pool you wish
to check. A value of 25 indicates the dsl_scan_phys_t has been damaged.
A value of 24 indicates that the dsl_scan_phys_t is normal. A value of 0
indicates that there has never been a scrub run on the pool.
The regression caused by the change to zbookmark_t never made it into a
tagged release, Gentoo backports, Ubuntu, Debian, Fedora, or EPEL
stable repositories. Only those using the HEAD version directly from
Github after the 0.6.2 but before the 0.6.3 tag are affected.
This patch does have one limitation that should be mentioned. It will not
detect errata #2 on a pool unless errata #1 is also present. It is expected
this will not be a significant problem because pools impacted by errata #2
have a high probability of being impacted by errata #1.
End users can ensure they do not hit this unlikely case by waiting for all
asynchronous destroy operations to complete before updating ZoL. The
presence of any background destroys on any imported pools can be checked
by running `zpool get freeing` as root. This will display a non-zero
value for any pool with an active asynchronous destroy.
Lastly, it is expected that no user data has been lost as a result of
this erratum.
Original-patch-by: Tim Chase <tim@chase2k.com>
Reworked-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #2094
2014-02-21 08:28:33 +04:00
|
|
|
ZPOOL_ERRATA_ZOL_2094_SCRUB,
|
|
|
|
ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY,
|
2017-11-08 22:12:59 +03:00
|
|
|
ZPOOL_ERRATA_ZOL_6845_ENCRYPTION,
|
2019-02-04 22:24:55 +03:00
|
|
|
ZPOOL_ERRATA_ZOL_8308_ENCRYPTION,
|
2014-02-21 07:57:17 +04:00
|
|
|
} zpool_errata_t;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Vdev statistics. Note: all fields should be 64-bit because this
|
2019-03-29 19:13:20 +03:00
|
|
|
* is passed between kernel and user land as an nvlist uint64 array.
|
|
|
|
*
|
|
|
|
* The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in
|
|
|
|
* order to keep subsequent members at their known fixed offsets. When
|
|
|
|
* adding a new field it must be added to the end of the structure.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2019-03-29 19:13:20 +03:00
|
|
|
#define VS_ZIO_TYPES 6
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
typedef struct vdev_stat {
|
|
|
|
hrtime_t vs_timestamp; /* time since vdev load */
|
|
|
|
uint64_t vs_state; /* vdev state */
|
|
|
|
uint64_t vs_aux; /* see vdev_aux_t */
|
|
|
|
uint64_t vs_alloc; /* space allocated */
|
|
|
|
uint64_t vs_space; /* total capacity */
|
|
|
|
uint64_t vs_dspace; /* deflated capacity */
|
|
|
|
uint64_t vs_rsize; /* replaceable dev size */
|
2012-01-24 06:43:32 +04:00
|
|
|
uint64_t vs_esize; /* expandable dev size */
|
2019-03-29 19:13:20 +03:00
|
|
|
uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */
|
|
|
|
uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t vs_read_errors; /* read errors */
|
|
|
|
uint64_t vs_write_errors; /* write errors */
|
|
|
|
uint64_t vs_checksum_errors; /* checksum errors */
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
uint64_t vs_initialize_errors; /* initializing errors */
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t vs_self_healed; /* self-healed bytes */
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t vs_scan_removing; /* removing? */
|
|
|
|
uint64_t vs_scan_processed; /* scan processed bytes */
|
2014-07-20 00:19:24 +04:00
|
|
|
uint64_t vs_fragmentation; /* device fragmentation */
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
uint64_t vs_initialize_bytes_done; /* bytes initialized */
|
|
|
|
uint64_t vs_initialize_bytes_est; /* total bytes to initialize */
|
2019-08-30 19:53:15 +03:00
|
|
|
uint64_t vs_initialize_state; /* vdev_initializing_state_t */
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
uint64_t vs_initialize_action_time; /* time_t */
|
2016-12-17 01:11:29 +03:00
|
|
|
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
|
2018-10-19 07:06:18 +03:00
|
|
|
uint64_t vs_resilver_deferred; /* resilver deferred */
|
2018-11-09 03:47:24 +03:00
|
|
|
uint64_t vs_slow_ios; /* slow IOs */
|
2019-03-29 19:13:20 +03:00
|
|
|
uint64_t vs_trim_errors; /* trimming errors */
|
|
|
|
uint64_t vs_trim_notsup; /* supported by device */
|
|
|
|
uint64_t vs_trim_bytes_done; /* bytes trimmed */
|
|
|
|
uint64_t vs_trim_bytes_est; /* total bytes to trim */
|
|
|
|
uint64_t vs_trim_state; /* vdev_trim_state_t */
|
|
|
|
uint64_t vs_trim_action_time; /* time_t */
|
2020-07-03 21:05:50 +03:00
|
|
|
uint64_t vs_rebuild_processed; /* bytes rebuilt */
|
2020-08-21 22:53:17 +03:00
|
|
|
uint64_t vs_configured_ashift; /* TLV vdev_ashift */
|
|
|
|
uint64_t vs_logical_ashift; /* vdev_logical_ashift */
|
|
|
|
uint64_t vs_physical_ashift; /* vdev_physical_ashift */
|
2021-11-30 17:46:25 +03:00
|
|
|
uint64_t vs_noalloc; /* allocations halted? */
|
2022-03-09 03:20:41 +03:00
|
|
|
uint64_t vs_pspace; /* physical capacity */
|
Adding Direct IO Support
Adding O_DIRECT support to ZFS to bypass the ARC for writes/reads.
O_DIRECT support in ZFS will always ensure there is coherency between
buffered and O_DIRECT IO requests. This ensures that all IO requests,
whether buffered or direct, will see the same file contents at all
times. Just as in other FS's , O_DIRECT does not imply O_SYNC. While
data is written directly to VDEV disks, metadata will not be synced
until the associated TXG is synced.
For both O_DIRECT read and write request the offset and request sizes,
at a minimum, must be PAGE_SIZE aligned. In the event they are not,
then EINVAL is returned unless the direct property is set to always (see
below).
For O_DIRECT writes:
The request also must be block aligned (recordsize) or the write
request will take the normal (buffered) write path. In the event that
request is block aligned and a cached copy of the buffer in the ARC,
then it will be discarded from the ARC forcing all further reads to
retrieve the data from disk.
For O_DIRECT reads:
The only alignment restrictions are PAGE_SIZE alignment. In the event
that the requested data is in buffered (in the ARC) it will just be
copied from the ARC into the user buffer.
For both O_DIRECT writes and reads the O_DIRECT flag will be ignored in
the event that file contents are mmap'ed. In this case, all requests
that are at least PAGE_SIZE aligned will just fall back to the buffered
paths. If the request however is not PAGE_SIZE aligned, EINVAL will
be returned as always regardless if the file's contents are mmap'ed.
Since O_DIRECT writes go through the normal ZIO pipeline, the
following operations are supported just as with normal buffered writes:
Checksum
Compression
Encryption
Erasure Coding
There is one caveat for the data integrity of O_DIRECT writes that is
distinct for each of the OS's supported by ZFS.
FreeBSD - FreeBSD is able to place user pages under write protection so
any data in the user buffers and written directly down to the
VDEV disks is guaranteed to not change. There is no concern
with data integrity and O_DIRECT writes.
Linux - Linux is not able to place anonymous user pages under write
protection. Because of this, if the user decides to manipulate
the page contents while the write operation is occurring, data
integrity can not be guaranteed. However, there is a module
parameter `zfs_vdev_direct_write_verify` that controls the
if a O_DIRECT writes that can occur to a top-level VDEV before
a checksum verify is run before the contents of the I/O buffer
are committed to disk. In the event of a checksum verification
failure the write will return EIO. The number of O_DIRECT write
checksum verification errors can be observed by doing
`zpool status -d`, which will list all verification errors that
have occurred on a top-level VDEV. Along with `zpool status`, a
ZED event will be issues as `dio_verify` when a checksum
verification error occurs.
ZVOLs and dedup is not currently supported with Direct I/O.
A new dataset property `direct` has been added with the following 3
allowable values:
disabled - Accepts O_DIRECT flag, but silently ignores it and treats
the request as a buffered IO request.
standard - Follows the alignment restrictions outlined above for
write/read IO requests when the O_DIRECT flag is used.
always - Treats every write/read IO request as though it passed
O_DIRECT and will do O_DIRECT if the alignment restrictions
are met otherwise will redirect through the ARC. This
property will not allow a request to fail.
There is also a module parameter zfs_dio_enabled that can be used to
force all reads and writes through the ARC. By setting this module
parameter to 0, it mimics as if the direct dataset property is set to
disabled.
Reviewed-by: Brian Behlendorf <behlendorf@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
Co-authored-by: Mark Maybee <mark.maybee@delphix.com>
Co-authored-by: Matt Macy <mmacy@FreeBSD.org>
Co-authored-by: Brian Behlendorf <behlendorf@llnl.gov>
Closes #10018
2024-09-14 23:47:59 +03:00
|
|
|
uint64_t vs_dio_verify_errors; /* DIO write verify errors */
|
2008-11-20 23:01:55 +03:00
|
|
|
} vdev_stat_t;
|
|
|
|
|
2020-08-21 22:53:17 +03:00
|
|
|
/*
 * True when uint64_t_field_count uint64_t-sized slots are enough to hold
 * vdev_stat_t up to and including the member `field`, i.e. the byte count
 * is at least offsetof(field) + sizeof(field).
 *
 * The count argument is parenthesized so that expressions such as
 * `a + b` are multiplied as a whole.
 */
#define	VDEV_STAT_VALID(field, uint64_t_field_count) \
	(((uint64_t_field_count) * sizeof (uint64_t)) >= \
	(offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field)))
|
2020-08-21 22:53:17 +03:00
|
|
|
|
2016-02-29 21:05:23 +03:00
|
|
|
/*
|
|
|
|
* Extended stats
|
|
|
|
*
|
|
|
|
* These are stats which aren't included in the original iostat output. For
|
|
|
|
* convenience, they are grouped together in vdev_stat_ex, although each stat
|
2016-08-08 11:00:08 +03:00
|
|
|
* is individually exported as an nvlist.
|
2016-02-29 21:05:23 +03:00
|
|
|
*/
|
|
|
|
typedef struct vdev_stat_ex {
|
|
|
|
/* Number of ZIOs issued to disk and waiting to finish */
|
|
|
|
uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE];
|
|
|
|
|
|
|
|
/* Number of ZIOs pending to be issued to disk */
|
|
|
|
uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Below are the histograms for various latencies. Buckets are in
|
|
|
|
* units of nanoseconds.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 2^37 nanoseconds = 134s. Timeouts will probably start kicking in
|
|
|
|
* before this.
|
|
|
|
*/
|
2016-05-26 00:21:35 +03:00
|
|
|
#define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */
|
|
|
|
#define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */
|
|
|
|
|
2016-02-29 21:05:23 +03:00
|
|
|
/* Amount of time in ZIO queue (ns) */
|
|
|
|
uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
|
2016-05-26 00:21:35 +03:00
|
|
|
[VDEV_L_HISTO_BUCKETS];
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
/* Total ZIO latency (ns). Includes queuing and disk access time */
|
2016-05-26 00:21:35 +03:00
|
|
|
uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS];
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
/* Amount of time to read/write the disk (ns) */
|
2016-05-26 00:21:35 +03:00
|
|
|
uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS];
|
|
|
|
|
|
|
|
/* "lookup the bucket for a value" histogram macros */
|
|
|
|
#define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \
|
|
|
|
buckets - 1) : 0)
|
|
|
|
#define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS)
|
|
|
|
#define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS)
|
|
|
|
|
|
|
|
/* Physical IO histogram */
|
|
|
|
uint64_t vsx_ind_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
|
|
|
|
[VDEV_RQ_HISTO_BUCKETS];
|
2016-02-29 21:05:23 +03:00
|
|
|
|
2016-05-26 00:21:35 +03:00
|
|
|
/* Delegated (aggregated) physical IO histogram */
|
|
|
|
uint64_t vsx_agg_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
|
|
|
|
[VDEV_RQ_HISTO_BUCKETS];
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
} vdev_stat_ex_t;
|
|
|
|
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
/*
 * Initialize functions.
 *
 * POOL_INITIALIZE_FUNCS is the trailing enumerator, so its value equals
 * the number of functions listed before it.  Keep it last.
 */
typedef enum pool_initialize_func {
	POOL_INITIALIZE_START,
	POOL_INITIALIZE_CANCEL,
	POOL_INITIALIZE_SUSPEND,
	POOL_INITIALIZE_UNINIT,
	POOL_INITIALIZE_FUNCS
} pool_initialize_func_t;
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
/*
 * TRIM functions.
 *
 * POOL_TRIM_FUNCS is the trailing enumerator, so its value equals the
 * number of functions listed before it.  Keep it last.
 */
typedef enum pool_trim_func {
	POOL_TRIM_START,
	POOL_TRIM_CANCEL,
	POOL_TRIM_SUSPEND,
	POOL_TRIM_FUNCS
} pool_trim_func_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
 * DDT statistics.  Note: all fields should be 64-bit because this
 * is passed between kernel and userland as an nvlist uint64 array.
 */
typedef struct ddt_object {
	uint64_t	ddo_count;	/* number of elements in ddt	*/
	uint64_t	ddo_dspace;	/* size of ddt on disk		*/
	uint64_t	ddo_mspace;	/* size of ddt in-core		*/
} ddt_object_t;
|
|
|
|
|
|
|
|
/*
 * Per-bucket DDT counters.  Every member is a uint64_t, so the structure
 * is eight consecutive 64-bit values.
 */
typedef struct ddt_stat {
	uint64_t	dds_blocks;	/* blocks			*/
	uint64_t	dds_lsize;	/* logical size			*/
	uint64_t	dds_psize;	/* physical size		*/
	uint64_t	dds_dsize;	/* deflated allocated size	*/
	uint64_t	dds_ref_blocks;	/* referenced blocks		*/
	uint64_t	dds_ref_lsize;	/* referenced lsize * refcnt	*/
	uint64_t	dds_ref_psize;	/* referenced psize * refcnt	*/
	uint64_t	dds_ref_dsize;	/* referenced dsize * refcnt	*/
} ddt_stat_t;
|
|
|
|
|
|
|
|
typedef struct ddt_histogram {
|
|
|
|
ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */
|
|
|
|
} ddt_histogram_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/* Driver names and device paths. */
#define	ZVOL_DRIVER	"zvol"
#define	ZFS_DRIVER	"zfs"
#define	ZFS_DEV		"/dev/zfs"
#define	ZFS_DEVDIR	"/dev"

#define	ZFS_SUPER_MAGIC	0x2fc12fc1

/* general zvol path */
#define	ZVOL_DIR		"/dev/zvol/"

#define	ZVOL_MAJOR		230
#define	ZVOL_MINOR_BITS		4
#define	ZVOL_MINOR_MASK		((1U << ZVOL_MINOR_BITS) - 1)
/* Derived from ZVOL_MINOR_BITS so the two cannot drift apart. */
#define	ZVOL_MINORS		(1 << ZVOL_MINOR_BITS)
#define	ZVOL_DEV_NAME		"zd"

#define	ZVOL_PROP_NAME		"name"
#define	ZVOL_DEFAULT_BLOCKSIZE	16384
|
2008-11-20 23:01:55 +03:00
|
|
|
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
/*
 * State of the initialize operation on a vdev.  Enumerators take the
 * implicit values 0..4 in declaration order.
 * NOTE(review): the initialize design notes say the state is stored on
 * disk in the vdev's leaf zap object; if these numeric values are what
 * gets stored, existing entries must not be renumbered -- confirm.
 */
typedef enum {
	VDEV_INITIALIZE_NONE,
	VDEV_INITIALIZE_ACTIVE,
	VDEV_INITIALIZE_CANCELED,
	VDEV_INITIALIZE_SUSPENDED,
	VDEV_INITIALIZE_COMPLETE
} vdev_initializing_state_t;
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
/*
 * State of the TRIM operation on a vdev.  Enumerators take the implicit
 * values 0..4 in declaration order.
 * NOTE(review): presumably these values are persisted or exchanged with
 * userland, in which case they must stay stable -- confirm before
 * reordering.
 */
typedef enum {
	VDEV_TRIM_NONE,
	VDEV_TRIM_ACTIVE,
	VDEV_TRIM_CANCELED,
	VDEV_TRIM_SUSPENDED,
	VDEV_TRIM_COMPLETE,
} vdev_trim_state_t;
|
|
|
|
|
2020-07-03 21:05:50 +03:00
|
|
|
/*
 * State of the rebuild operation on a vdev.  Enumerators take the
 * implicit values 0..3 in declaration order.  Unlike the initialize and
 * TRIM state enums there is no SUSPENDED member here.
 * NOTE(review): presumably these values are persisted or exchanged with
 * userland, in which case they must stay stable -- confirm before
 * reordering.
 */
typedef enum {
	VDEV_REBUILD_NONE,
	VDEV_REBUILD_ACTIVE,
	VDEV_REBUILD_CANCELED,
	VDEV_REBUILD_COMPLETE,
} vdev_rebuild_state_t;
|
|
|
|
|
2019-03-12 23:13:22 +03:00
|
|
|
/*
 * nvlist name constants.  Facilitate restricting snapshot iteration range for
 * the "list next snapshot" ioctl
 *
 * These are the nvpair names only; the value type and whether each bound
 * is inclusive are decided by the ioctl handler, not here.
 */
#define	SNAP_ITER_MIN_TXG	"snap_iter_min_txg"
#define	SNAP_ITER_MAX_TXG	"snap_iter_max_txg"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * /dev/zfs ioctl numbers.
 *
 * These numbers cannot change over time. New ioctl numbers must be appended.
 *
 * Only the first enumerator of each section has an explicit value; every
 * other value is implied by position, so inserting or removing an entry
 * in the middle renumbers everything after it.
 *
 * The hexadecimal values in the trailing comments assume
 * ZFS_IOC_FIRST == ('Z' << 8), i.e. 0x5a00.  On FreeBSD ZFS_IOC_FIRST
 * is 0, so there only the low byte of each comment applies.
 */
typedef enum zfs_ioc {
	/*
	 * Core features - 90/128 numbers reserved.
	 */
#ifdef __FreeBSD__
	ZFS_IOC_FIRST =	0,
#else
	ZFS_IOC_FIRST =	('Z' << 8),
#endif
	ZFS_IOC = ZFS_IOC_FIRST,
	ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST,	/* 0x5a00 */
	ZFS_IOC_POOL_DESTROY,			/* 0x5a01 */
	ZFS_IOC_POOL_IMPORT,			/* 0x5a02 */
	ZFS_IOC_POOL_EXPORT,			/* 0x5a03 */
	ZFS_IOC_POOL_CONFIGS,			/* 0x5a04 */
	ZFS_IOC_POOL_STATS,			/* 0x5a05 */
	ZFS_IOC_POOL_TRYIMPORT,			/* 0x5a06 */
	ZFS_IOC_POOL_SCAN,			/* 0x5a07 */
	ZFS_IOC_POOL_FREEZE,			/* 0x5a08 */
	ZFS_IOC_POOL_UPGRADE,			/* 0x5a09 */
	ZFS_IOC_POOL_GET_HISTORY,		/* 0x5a0a */
	ZFS_IOC_VDEV_ADD,			/* 0x5a0b */
	ZFS_IOC_VDEV_REMOVE,			/* 0x5a0c */
	ZFS_IOC_VDEV_SET_STATE,			/* 0x5a0d */
	ZFS_IOC_VDEV_ATTACH,			/* 0x5a0e */
	ZFS_IOC_VDEV_DETACH,			/* 0x5a0f */
	ZFS_IOC_VDEV_SETPATH,			/* 0x5a10 */
	ZFS_IOC_VDEV_SETFRU,			/* 0x5a11 */
	ZFS_IOC_OBJSET_STATS,			/* 0x5a12 */
	ZFS_IOC_OBJSET_ZPLPROPS,		/* 0x5a13 */
	ZFS_IOC_DATASET_LIST_NEXT,		/* 0x5a14 */
	ZFS_IOC_SNAPSHOT_LIST_NEXT,		/* 0x5a15 */
	ZFS_IOC_SET_PROP,			/* 0x5a16 */
	ZFS_IOC_CREATE,				/* 0x5a17 */
	ZFS_IOC_DESTROY,			/* 0x5a18 */
	ZFS_IOC_ROLLBACK,			/* 0x5a19 */
	ZFS_IOC_RENAME,				/* 0x5a1a */
	ZFS_IOC_RECV,				/* 0x5a1b */
	ZFS_IOC_SEND,				/* 0x5a1c */
	ZFS_IOC_INJECT_FAULT,			/* 0x5a1d */
	ZFS_IOC_CLEAR_FAULT,			/* 0x5a1e */
	ZFS_IOC_INJECT_LIST_NEXT,		/* 0x5a1f */
	ZFS_IOC_ERROR_LOG,			/* 0x5a20 */
	ZFS_IOC_CLEAR,				/* 0x5a21 */
	ZFS_IOC_PROMOTE,			/* 0x5a22 */
	ZFS_IOC_SNAPSHOT,			/* 0x5a23 */
	ZFS_IOC_DSOBJ_TO_DSNAME,		/* 0x5a24 */
	ZFS_IOC_OBJ_TO_PATH,			/* 0x5a25 */
	ZFS_IOC_POOL_SET_PROPS,			/* 0x5a26 */
	ZFS_IOC_POOL_GET_PROPS,			/* 0x5a27 */
	ZFS_IOC_SET_FSACL,			/* 0x5a28 */
	ZFS_IOC_GET_FSACL,			/* 0x5a29 */
	ZFS_IOC_SHARE,				/* 0x5a2a */
	ZFS_IOC_INHERIT_PROP,			/* 0x5a2b */
	ZFS_IOC_SMB_ACL,			/* 0x5a2c */
	ZFS_IOC_USERSPACE_ONE,			/* 0x5a2d */
	ZFS_IOC_USERSPACE_MANY,			/* 0x5a2e */
	ZFS_IOC_USERSPACE_UPGRADE,		/* 0x5a2f */
	ZFS_IOC_HOLD,				/* 0x5a30 */
	ZFS_IOC_RELEASE,			/* 0x5a31 */
	ZFS_IOC_GET_HOLDS,			/* 0x5a32 */
	ZFS_IOC_OBJSET_RECVD_PROPS,		/* 0x5a33 */
	ZFS_IOC_VDEV_SPLIT,			/* 0x5a34 */
	ZFS_IOC_NEXT_OBJ,			/* 0x5a35 */
	ZFS_IOC_DIFF,				/* 0x5a36 */
	ZFS_IOC_TMP_SNAPSHOT,			/* 0x5a37 */
	ZFS_IOC_OBJ_TO_STATS,			/* 0x5a38 */
	ZFS_IOC_SPACE_WRITTEN,			/* 0x5a39 */
	ZFS_IOC_SPACE_SNAPS,			/* 0x5a3a */
	ZFS_IOC_DESTROY_SNAPS,			/* 0x5a3b */
	ZFS_IOC_POOL_REGUID,			/* 0x5a3c */
	ZFS_IOC_POOL_REOPEN,			/* 0x5a3d */
	ZFS_IOC_SEND_PROGRESS,			/* 0x5a3e */
	ZFS_IOC_LOG_HISTORY,			/* 0x5a3f */
	ZFS_IOC_SEND_NEW,			/* 0x5a40 */
	ZFS_IOC_SEND_SPACE,			/* 0x5a41 */
	ZFS_IOC_CLONE,				/* 0x5a42 */
	ZFS_IOC_BOOKMARK,			/* 0x5a43 */
	ZFS_IOC_GET_BOOKMARKS,			/* 0x5a44 */
	ZFS_IOC_DESTROY_BOOKMARKS,		/* 0x5a45 */
	ZFS_IOC_RECV_NEW,			/* 0x5a46 */
	ZFS_IOC_POOL_SYNC,			/* 0x5a47 */
	ZFS_IOC_CHANNEL_PROGRAM,		/* 0x5a48 */
	ZFS_IOC_LOAD_KEY,			/* 0x5a49 */
	ZFS_IOC_UNLOAD_KEY,			/* 0x5a4a */
	ZFS_IOC_CHANGE_KEY,			/* 0x5a4b */
	ZFS_IOC_REMAP,				/* 0x5a4c */
	ZFS_IOC_POOL_CHECKPOINT,		/* 0x5a4d */
	ZFS_IOC_POOL_DISCARD_CHECKPOINT,	/* 0x5a4e */
	ZFS_IOC_POOL_INITIALIZE,		/* 0x5a4f */
	ZFS_IOC_POOL_TRIM,			/* 0x5a50 */
	ZFS_IOC_REDACT,				/* 0x5a51 */
	ZFS_IOC_GET_BOOKMARK_PROPS,		/* 0x5a52 */
	ZFS_IOC_WAIT,				/* 0x5a53 */
	ZFS_IOC_WAIT_FS,			/* 0x5a54 */
	ZFS_IOC_VDEV_GET_PROPS,			/* 0x5a55 */
	ZFS_IOC_VDEV_SET_PROPS,			/* 0x5a56 */
	ZFS_IOC_POOL_SCRUB,			/* 0x5a57 */
	ZFS_IOC_POOL_PREFETCH,			/* 0x5a58 */
	ZFS_IOC_DDT_PRUNE,			/* 0x5a59 */

	/*
	 * Per-platform (Optional) - 8/128 numbers reserved.
	 *
	 * The *_USERNS_* names are aliases: they are assigned the value of
	 * the JAIL/UNJAIL enumerator before them and do not advance the
	 * implicit counter.
	 */
	ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80,
	ZFS_IOC_EVENTS_NEXT,			/* 0x81 (Linux) */
	ZFS_IOC_EVENTS_CLEAR,			/* 0x82 (Linux) */
	ZFS_IOC_EVENTS_SEEK,			/* 0x83 (Linux) */
	ZFS_IOC_NEXTBOOT,			/* 0x84 (FreeBSD) */
	ZFS_IOC_JAIL,				/* 0x85 (FreeBSD) */
	ZFS_IOC_USERNS_ATTACH = ZFS_IOC_JAIL,	/* 0x85 (Linux) */
	ZFS_IOC_UNJAIL,				/* 0x86 (FreeBSD) */
	ZFS_IOC_USERNS_DETACH = ZFS_IOC_UNJAIL,	/* 0x86 (Linux) */
	ZFS_IOC_SET_BOOTENV,			/* 0x87 */
	ZFS_IOC_GET_BOOTENV,			/* 0x88 */
	ZFS_IOC_LAST
} zfs_ioc_t;
|
|
|
|
|
2011-02-22 13:58:44 +03:00
|
|
|
/*
 * zvol ioctl to get dataset name
 *
 * Encoded with _IOR, ioctl type 0x12 and number 125; the argument is a
 * char array of ZFS_MAX_DATASET_NAME_LEN bytes, so the name length
 * limit is part of the encoded ioctl number.
 */
#define	BLKZNAME		_IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN])
|
2011-02-22 13:58:44 +03:00
|
|
|
|
Expose additional file level attributes
ZFS allows to update and retrieve additional file level attributes for
FreeBSD. This commit allows additional file level attributes to be
updated and retrieved for Linux. These include the flags stored in the
upper half of z_pflags only.
Two new IOCTLs have been added for this purpose. ZFS_IOC_GETDOSFLAGS
can be used to retrieve the attributes, while ZFS_IOC_SETDOSFLAGS can
be used to update the attributes.
Attributes that are allowed to be updated include ZFS_IMMUTABLE,
ZFS_APPENDONLY, ZFS_NOUNLINK, ZFS_ARCHIVE, ZFS_NODUMP, ZFS_SYSTEM,
ZFS_HIDDEN, ZFS_READONLY, ZFS_REPARSE, ZFS_OFFLINE and ZFS_SPARSE.
Flags can be or'd together while calling ZFS_IOC_SETDOSFLAGS.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Umer Saleem <usaleem@ixsystems.com>
Closes #13118
2022-03-08 04:52:03 +03:00
|
|
|
#ifdef __linux__

/*
 * IOCTLs to update and retrieve additional file level attributes on
 * Linux.
 */
#define	ZFS_IOC_GETDOSFLAGS	_IOR(0x83, 1, uint64_t)
#define	ZFS_IOC_SETDOSFLAGS	_IOW(0x83, 2, uint64_t)

/*
 * Additional file level attributes, that are stored
 * in the upper half of z_pflags
 *
 * Each flag is a single bit at or above bit 32, assigned consecutively
 * from 0x1 << 32 (ZFS_READONLY) to 0x2000 << 32 (ZFS_SPARSE).
 */
#define	ZFS_READONLY		0x0000000100000000ull
#define	ZFS_HIDDEN		0x0000000200000000ull
#define	ZFS_SYSTEM		0x0000000400000000ull
#define	ZFS_ARCHIVE		0x0000000800000000ull
#define	ZFS_IMMUTABLE		0x0000001000000000ull
#define	ZFS_NOUNLINK		0x0000002000000000ull
#define	ZFS_APPENDONLY		0x0000004000000000ull
#define	ZFS_NODUMP		0x0000008000000000ull
#define	ZFS_OPAQUE		0x0000010000000000ull
#define	ZFS_AV_QUARANTINED	0x0000020000000000ull
#define	ZFS_AV_MODIFIED		0x0000040000000000ull
#define	ZFS_REPARSE		0x0000080000000000ull
#define	ZFS_OFFLINE		0x0000100000000000ull
#define	ZFS_SPARSE		0x0000200000000000ull

/*
 * Mask of the flags above that are included in the user-visible set.
 * ZFS_OPAQUE, ZFS_AV_QUARANTINED and ZFS_AV_MODIFIED are deliberately
 * absent from the mask.
 */
#define	ZFS_DOS_FL_USER_VISIBLE	(ZFS_IMMUTABLE | ZFS_APPENDONLY | \
    ZFS_NOUNLINK | ZFS_ARCHIVE | ZFS_NODUMP | ZFS_SYSTEM | \
    ZFS_HIDDEN | ZFS_READONLY | ZFS_REPARSE | ZFS_OFFLINE | \
    ZFS_SPARSE)

#endif
|
|
|
|
|
2016-12-17 01:11:29 +03:00
|
|
|
/*
|
|
|
|
* ZFS-specific error codes used for returning descriptive errors
|
|
|
|
* to the userland through zfs ioctls.
|
|
|
|
*
|
|
|
|
* The enum implicitly includes all the error codes from errno.h.
|
|
|
|
* New code should use and extend this enum for errors that are
|
|
|
|
* not described precisely by generic errno codes.
|
Add basic zfs ioc input nvpair validation
We want newer versions of libzfs_core to run against an existing
zfs kernel module (i.e. a deferred reboot or module reload after
an update).
Programmatically document, via a zfs_ioc_key_t, the valid arguments
for the ioc commands that rely on nvpair input arguments (i.e. non
legacy commands from libzfs_core). Automatically verify the expected
pairs before dispatching a command.
This initial phase focuses on the non-legacy ioctls. A follow-on
change can address the legacy ioctl input from the zfs_cmd_t.
The zfs_ioc_key_t for zfs_keys_channel_program looks like:
static const zfs_ioc_key_t zfs_keys_channel_program[] = {
{"program", DATA_TYPE_STRING, 0},
{"arg", DATA_TYPE_UNKNOWN, 0},
{"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
{"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
{"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
};
Introduce four input errors to identify specific input failures
(in addition to generic argument value errors like EINVAL, ERANGE,
EBADF, and E2BIG).
ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel
ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel
ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing
ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@delphix.com>
Closes #7780
2018-09-02 22:14:01 +03:00
|
|
|
*
|
|
|
|
* These numbers should not change over time. New entries should be appended.
|
2019-11-11 10:24:14 +03:00
|
|
|
*
|
|
|
|
* (Keep in sync with contrib/pyzfs/libzfs_core/_constants.py)
|
2016-12-17 01:11:29 +03:00
|
|
|
*/
|
|
|
|
typedef enum {
|
|
|
|
ZFS_ERR_CHECKPOINT_EXISTS = 1024,
|
|
|
|
ZFS_ERR_DISCARDING_CHECKPOINT,
|
|
|
|
ZFS_ERR_NO_CHECKPOINT,
|
|
|
|
ZFS_ERR_DEVRM_IN_PROGRESS,
|
Add basic zfs ioc input nvpair validation
We want newer versions of libzfs_core to run against an existing
zfs kernel module (i.e. a deferred reboot or module reload after
an update).
Programmatically document, via a zfs_ioc_key_t, the valid arguments
for the ioc commands that rely on nvpair input arguments (i.e. non
legacy commands from libzfs_core). Automatically verify the expected
pairs before dispatching a command.
This initial phase focuses on the non-legacy ioctls. A follow-on
change can address the legacy ioctl input from the zfs_cmd_t.
The zfs_ioc_key_t for zfs_keys_channel_program looks like:
static const zfs_ioc_key_t zfs_keys_channel_program[] = {
{"program", DATA_TYPE_STRING, 0},
{"arg", DATA_TYPE_UNKNOWN, 0},
{"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
{"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
{"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
};
Introduce four input errors to identify specific input failures
(in addition to generic argument value errors like EINVAL, ERANGE,
EBADF, and E2BIG).
ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel
ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel
ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing
ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@delphix.com>
Closes #7780
2018-09-02 22:14:01 +03:00
|
|
|
ZFS_ERR_VDEV_TOO_BIG,
|
|
|
|
ZFS_ERR_IOC_CMD_UNAVAIL,
|
|
|
|
ZFS_ERR_IOC_ARG_UNAVAIL,
|
|
|
|
ZFS_ERR_IOC_ARG_REQUIRED,
|
|
|
|
ZFS_ERR_IOC_ARG_BADTYPE,
|
2019-02-09 02:44:15 +03:00
|
|
|
ZFS_ERR_WRONG_PARENT,
|
2019-02-04 22:24:55 +03:00
|
|
|
ZFS_ERR_FROM_IVSET_GUID_MISSING,
|
|
|
|
ZFS_ERR_FROM_IVSET_GUID_MISMATCH,
|
2019-05-08 01:18:44 +03:00
|
|
|
ZFS_ERR_SPILL_BLOCK_FLAG_MISSING,
|
Implement Redacted Send/Receive
Redacted send/receive allows users to send subsets of their data to
a target system. One possible use case for this feature is to not
transmit sensitive information to a data warehousing, test/dev, or
analytics environment. Another is to save space by not replicating
unimportant data within a given dataset, for example in backup tools
like zrepl.
Redacted send/receive is a three-stage process. First, a clone (or
clones) is made of the snapshot to be sent to the target. In this
clone (or clones), all unnecessary or unwanted data is removed or
modified. This clone is then snapshotted to create the "redaction
snapshot" (or snapshots). Second, the new zfs redact command is used
to create a redaction bookmark. The redaction bookmark stores the
list of blocks in a snapshot that were modified by the redaction
snapshot(s). Finally, the redaction bookmark is passed as a parameter
to zfs send. When sending to the snapshot that was redacted, the
redaction bookmark is used to filter out blocks that contain sensitive
or unwanted information, and those blocks are not included in the send
stream. When sending from the redaction bookmark, the blocks it
contains are considered as candidate blocks in addition to those
blocks in the destination snapshot that were modified since the
creation_txg of the redaction bookmark. This step is necessary to
allow the target to rehydrate data in the case where some blocks are
accidentally or unnecessarily modified in the redaction snapshot.
The changes to bookmarks to enable fast space estimation involve
adding deadlists to bookmarks. There is also logic to manage the
life cycles of these deadlists.
The new size estimation process operates in cases where previously
an accurate estimate could not be provided. In those cases, a send
is performed where no data blocks are read, reducing the runtime
significantly and providing a byte-accurate size estimate.
Reviewed-by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Chris Williamson <chris.williamson@delphix.com>
Reviewed-by: Pavel Zhakarov <pavel.zakharov@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #7958
2019-06-19 19:48:13 +03:00
|
|
|
ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE,
|
2019-07-18 23:02:33 +03:00
|
|
|
ZFS_ERR_EXPORT_IN_PROGRESS,
|
2019-11-11 10:24:14 +03:00
|
|
|
ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR,
|
2020-03-17 20:30:33 +03:00
|
|
|
ZFS_ERR_STREAM_TRUNCATED,
|
File incorrectly zeroed when receiving incremental stream that toggles -L
Background:
By increasing the recordsize property above the default of 128KB, a
filesystem may have "large" blocks. By default, a send stream of such a
filesystem does not contain large WRITE records, instead it decreases
objects' block sizes to 128KB and splits the large blocks into 128KB
blocks, allowing the large-block filesystem to be received by a system
that does not support the `large_blocks` feature. A send stream
generated by `zfs send -L` (or `--large-block`) preserves the large
block size on the receiving system, by using large WRITE records.
When receiving an incremental send stream for a filesystem with large
blocks, if the send stream's -L flag was toggled, a bug is encountered
in which the file's contents are incorrectly zeroed out. The contents
of any blocks that were not modified by this send stream will be lost.
"Toggled" means that the previous send used `-L`, but this incremental
does not use `-L` (-L to no-L); or that the previous send did not use
`-L`, but this incremental does use `-L` (no-L to -L).
Changes:
This commit addresses the problem with several changes to the semantics
of zfs send/receive:
1. "-L to no-L" incrementals are rejected. If the previous send used
`-L`, but this incremental does not use `-L`, the `zfs receive` will
fail with this error message:
incremental send stream requires -L (--large-block), to match
previous receive.
2. "no-L to -L" incrementals are handled correctly, preserving the
smaller (128KB) block size of any already-received files that used large
blocks on the sending system but were split by `zfs send` without the
`-L` flag.
3. A new send stream format flag is added, `SWITCH_TO_LARGE_BLOCKS`.
This feature indicates that we can correctly handle "no-L to -L"
incrementals. This flag is currently not set on any send streams. In
the future, we intend for incremental send streams of snapshots that
have large blocks to use `-L` by default, and these streams will also
have the `SWITCH_TO_LARGE_BLOCKS` feature set. This ensures that streams
from the default use of `zfs send` won't encounter the bug mentioned
above, because they can't be received by software with the bug.
Implementation notes:
To facilitate accessing the ZPL's generation number,
`zfs_space_delta_cb()` has been renamed to `zpl_get_file_info()` and
restructured to fill in a struct with ZPL-specific info including owner
and generation.
In the "no-L to -L" case, if this is a compressed send stream (from
`zfs send -cL`), large WRITE records that are being written to small
(128KB) blocksize files need to be decompressed so that they can be
written split up into multiple blocks. The zio pipeline will recompress
each smaller block individually.
A new test case, `send-L_toggle`, is added, which tests the "no-L to -L"
case and verifies that we get an error for the "-L to no-L" case.
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #6224
Closes #10383
2020-06-09 20:41:01 +03:00
|
|
|
ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH,
|
2020-07-03 21:05:50 +03:00
|
|
|
ZFS_ERR_RESILVER_IN_PROGRESS,
|
|
|
|
ZFS_ERR_REBUILD_IN_PROGRESS,
|
2020-08-01 18:41:31 +03:00
|
|
|
ZFS_ERR_BADPROP,
|
2021-11-30 17:46:25 +03:00
|
|
|
ZFS_ERR_VDEV_NOTSUP,
|
2021-02-21 19:19:43 +03:00
|
|
|
ZFS_ERR_NOT_USER_NAMESPACE,
|
2022-09-28 02:34:27 +03:00
|
|
|
ZFS_ERR_RESUME_EXISTS,
|
Better handling for future crypto parameters
The intent is that this is like ENOTSUP, but specifically for when
something can't be done because we have no support for the requested
crypto parameters; eg unlocking a dataset or receiving a stream
encrypted with a suite we don't support.
It's not intended to be recoverable without upgrading ZFS itself.
If the request could be made to work by enabling a feature or modifying
some other configuration item, then some other code should be used.
load-key: In the future we might have more crypto suites (i.e. new values
for the `encryption` property). Right now, trying to load a key on such
a future crypto suite will look up suite parameters off the end of the
crypto table, resulting in misbehaviour and/or crashes (or, with debug
enabled, trip the assertion in `zio_crypt_key_unwrap`).
Instead, let's check the value we got from the dataset, and if we can't
handle it, abort early.
recv: When receiving a raw stream encrypted with an unknown crypto
suite, `zfs recv` would report a generic `invalid backup stream`
(EINVAL). While technically correct, it's not super helpful, so let's
ship a more specific error code and message.
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #14577
2023-03-08 01:05:14 +03:00
|
|
|
ZFS_ERR_CRYPTO_NOTSUP,
|
RAID-Z expansion feature
This feature allows disks to be added one at a time to a RAID-Z group,
expanding its capacity incrementally. This feature is especially useful
for small pools (typically with only one RAID-Z group), where there
isn't sufficient hardware to add capacity by adding a whole new RAID-Z
group (typically doubling the number of disks).
== Initiating expansion ==
A new device (disk) can be attached to an existing RAIDZ vdev, by
running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank
raidz2-0 sda`. The new device will become part of the RAIDZ group. A
"raidz expansion" will be initiated, and the new device will contribute
additional space to the RAIDZ group once the expansion completes.
The `feature@raidz_expansion` on-disk feature flag must be `enabled` to
initiate an expansion, and it remains `active` for the life of the pool.
In other words, pools with expanded RAIDZ vdevs can not be imported by
older releases of the ZFS software.
== During expansion ==
The expansion entails reading all allocated space from existing disks in
the RAIDZ group, and rewriting it to the new disks in the RAIDZ group
(including the newly added device).
The expansion progress can be monitored with `zpool status`.
Data redundancy is maintained during (and after) the expansion. If a
disk fails while the expansion is in progress, the expansion pauses
until the health of the RAIDZ vdev is restored (e.g. by replacing the
failed disk and waiting for reconstruction to complete).
The pool remains accessible during expansion. Following a reboot or
export/import, the expansion resumes where it left off.
== After expansion ==
When the expansion completes, the additional space is available for use,
and is reflected in the `available` zfs property (as seen in `zfs list`,
`df`, etc).
Expansion does not change the number of failures that can be tolerated
without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after
expansion).
A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old
data-to-parity ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but
distributed among the larger set of disks. New blocks will be written
with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been
expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ
vdev's "assumed parity ratio" does not change, so slightly less space
than is expected may be reported for newly-written blocks, according to
`zfs list`, `df`, `ls -s`, and similar tools.
Sponsored-by: The FreeBSD Foundation
Sponsored-by: iXsystems, Inc.
Sponsored-by: vStack
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Authored-by: Matthew Ahrens <mahrens@delphix.com>
Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com>
Contributions-by: Stuart Maybee <stuart.maybee@comcast.net>
Contributions-by: Thorsten Behrens <tbehrens@outlook.com>
Contributions-by: Fmstrat <nospam@nowsci.com>
Contributions-by: Don Brady <dev.fs.zfs@gmail.com>
Signed-off-by: Don Brady <dev.fs.zfs@gmail.com>
Closes #15022
2023-11-08 21:19:41 +03:00
|
|
|
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
|
2024-03-29 22:15:56 +03:00
|
|
|
ZFS_ERR_ASHIFT_MISMATCH,
|
feature: large_microzap
In a4b21eadec we added the zap_micro_max_size tuneable to raise the size
at which "micro" (single-block) ZAPs are upgraded to "fat" (multi-block)
ZAPs. Before this, a microZAP was limited to 128KiB, which was the old
largest block size. The side effect of raising the max size past 128KiB
is that the microZAP must be stored in a large block, requiring the large_blocks
feature.
Unfortunately, this means that a backup stream created without the
--large-block (-L) flag to zfs send would split the microZAP block into
smaller blocks and send those, as is normal behaviour for large blocks.
This would be received correctly, but since microZAPs are limited to the
first block in the object by definition, the entries in the later blocks
would be inaccessible. For directory ZAPs, this gives the appearance of
files being lost.
This commit adds a feature flag, large_microzap, that must be enabled
for microZAPs to grow beyond 128KiB, and which will be activated the
first time that occurs. This feature is later checked when generating
the stream and if active, the send operation will abort unless
--large-block has also been requested.
Changing the limit still requires zap_micro_max_size to be changed. The
state of this flag effectively sets the upper value for this tuneable,
that is, if the feature is disabled, the tuneable will be clamped to
128KiB.
A stream flag is also added to ensure that the receiver also activates
its own feature flag upon receiving the stream. This is not strictly
necessary to _use_ the received microZAP, since it doesn't care how
large its block is, but it is required to send the microZAP object on,
otherwise the original problem occurs again.
Because it's difficult to reliably distinguish a microZAP from a fatZAP
from outside the ZAP code, and because it seems unlikely that most
users are affected (a fairly niche tuneable combined with what should be
an uncommon use of send), and for the sake of expediency, this change
activates the feature the first time a microZAP grows to use a large
block, and is never deactivated after that. This can be improved in the
future.
This commit changes nothing for existing pools that already have large
microZAPs. The feature will not be retroactively applied, but will be
activated the next time a microZAP grows past the limit.
Don't use large_blocks feature for enable/disable tests. The
large_microzap depends on large_blocks, so it gets enabled as a
dependency, breaking the test. Instead use feature "longname", which has
the exact same feature characteristics.
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16593
2024-10-03 06:47:11 +03:00
|
|
|
ZFS_ERR_STREAM_LARGE_MICROZAP,
|
2016-12-17 01:11:29 +03:00
|
|
|
} zfs_errno_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Internal SPA load state. Used by FMA diagnosis engine.
|
|
|
|
*/
|
|
|
|
/*
 * No enumerator below has an explicit value, so the states are numbered
 * in declaration order: SPA_LOAD_NONE is 0 and SPA_LOAD_CREATE is 6.
 */
typedef enum {
|
2010-05-29 00:45:14 +04:00
|
|
|
SPA_LOAD_NONE, /* no load in progress */
|
|
|
|
SPA_LOAD_OPEN, /* normal open */
|
|
|
|
SPA_LOAD_IMPORT, /* import in progress */
|
|
|
|
SPA_LOAD_TRYIMPORT, /* tryimport in progress */
|
|
|
|
SPA_LOAD_RECOVER, /* recovery requested */
|
2016-10-14 03:59:18 +03:00
|
|
|
SPA_LOAD_ERROR, /* load failed */
|
|
|
|
SPA_LOAD_CREATE /* creation in progress */
|
2008-11-20 23:01:55 +03:00
|
|
|
} spa_load_state_t;
|
|
|
|
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
/*
 * Kinds of background pool activity. No enumerator has an explicit
 * value, so they are numbered from 0 in declaration order, and the final
 * entry, ZPOOL_WAIT_NUM_ACTIVITIES, equals the number of activities
 * declared before it (currently 9).
 * NOTE(review): presumably this is the value passed under the
 * ZPOOL_WAIT_ACTIVITY name when invoking ZFS_IOC_POOL_WAIT -- confirm.
 */
typedef enum {
|
|
|
|
ZPOOL_WAIT_CKPT_DISCARD,
|
|
|
|
ZPOOL_WAIT_FREE,
|
|
|
|
ZPOOL_WAIT_INITIALIZE,
|
|
|
|
ZPOOL_WAIT_REPLACE,
|
|
|
|
ZPOOL_WAIT_REMOVE,
|
|
|
|
ZPOOL_WAIT_RESILVER,
|
|
|
|
ZPOOL_WAIT_SCRUB,
|
2020-03-05 02:07:11 +03:00
|
|
|
ZPOOL_WAIT_TRIM,
|
RAID-Z expansion feature
This feature allows disks to be added one at a time to a RAID-Z group,
expanding its capacity incrementally. This feature is especially useful
for small pools (typically with only one RAID-Z group), where there
isn't sufficient hardware to add capacity by adding a whole new RAID-Z
group (typically doubling the number of disks).
== Initiating expansion ==
A new device (disk) can be attached to an existing RAIDZ vdev, by
running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank
raidz2-0 sda`. The new device will become part of the RAIDZ group. A
"raidz expansion" will be initiated, and the new device will contribute
additional space to the RAIDZ group once the expansion completes.
The `feature@raidz_expansion` on-disk feature flag must be `enabled` to
initiate an expansion, and it remains `active` for the life of the pool.
In other words, pools with expanded RAIDZ vdevs can not be imported by
older releases of the ZFS software.
== During expansion ==
The expansion entails reading all allocated space from existing disks in
the RAIDZ group, and rewriting it to the new disks in the RAIDZ group
(including the newly added device).
The expansion progress can be monitored with `zpool status`.
Data redundancy is maintained during (and after) the expansion. If a
disk fails while the expansion is in progress, the expansion pauses
until the health of the RAIDZ vdev is restored (e.g. by replacing the
failed disk and waiting for reconstruction to complete).
The pool remains accessible during expansion. Following a reboot or
export/import, the expansion resumes where it left off.
== After expansion ==
When the expansion completes, the additional space is available for use,
and is reflected in the `available` zfs property (as seen in `zfs list`,
`df`, etc).
Expansion does not change the number of failures that can be tolerated
without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after
expansion).
A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old
data-to-parity ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but
distributed among the larger set of disks. New blocks will be written
with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been
expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ
vdev's "assumed parity ratio" does not change, so slightly less space
than is expected may be reported for newly-written blocks, according to
`zfs list`, `df`, `ls -s`, and similar tools.
Sponsored-by: The FreeBSD Foundation
Sponsored-by: iXsystems, Inc.
Sponsored-by: vStack
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Authored-by: Matthew Ahrens <mahrens@delphix.com>
Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com>
Contributions-by: Stuart Maybee <stuart.maybee@comcast.net>
Contributions-by: Thorsten Behrens <tbehrens@outlook.com>
Contributions-by: Fmstrat <nospam@nowsci.com>
Contributions-by: Don Brady <dev.fs.zfs@gmail.com>
Signed-off-by: Don Brady <dev.fs.zfs@gmail.com>
Closes #15022
2023-11-08 21:19:41 +03:00
|
|
|
ZPOOL_WAIT_RAIDZ_EXPAND,
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communiction primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
ZPOOL_WAIT_NUM_ACTIVITIES
|
|
|
|
} zpool_wait_activity_t;
|
|
|
|
|
2020-04-01 20:02:06 +03:00
|
|
|
/*
 * Activity kinds for a filesystem-level wait. ZFS_WAIT_DELETEQ is 0 and
 * the trailing ZFS_WAIT_NUM_ACTIVITIES is 1, i.e. the number of
 * activities declared before it.
 * NOTE(review): presumably this is the value passed under the
 * ZFS_WAIT_ACTIVITY name when invoking ZFS_IOC_WAIT_FS -- confirm.
 */
typedef enum {
|
|
|
|
ZFS_WAIT_DELETEQ,
|
|
|
|
ZFS_WAIT_NUM_ACTIVITIES
|
|
|
|
} zfs_wait_activity_t;
|
|
|
|
|
2024-07-26 19:16:18 +03:00
|
|
|
/*
 * Prefetch type selector. ZPOOL_PREFETCH_NONE is explicitly 0, so
 * ZPOOL_PREFETCH_DDT takes the next value, 1.
 * NOTE(review): presumably this is the value passed under the
 * ZPOOL_PREFETCH_TYPE name when invoking ZFS_IOC_POOL_PREFETCH -- confirm.
 */
typedef enum {
|
|
|
|
ZPOOL_PREFETCH_NONE = 0,
|
|
|
|
ZPOOL_PREFETCH_DDT
|
|
|
|
} zpool_prefetch_type_t;
|
|
|
|
|
2024-06-18 01:35:18 +03:00
|
|
|
/*
 * Unit in which a DDT prune amount is expressed. Values are implicit:
 * NONE is 0, AGE is 1, PERCENTAGE is 2.
 * NOTE(review): presumably paired with the DDT_PRUNE_UNIT and
 * DDT_PRUNE_AMOUNT names used when invoking ZFS_IOC_DDT_PRUNE -- confirm.
 */
typedef enum {
|
|
|
|
ZPOOL_DDT_PRUNE_NONE,
|
|
|
|
ZPOOL_DDT_PRUNE_AGE, /* in seconds */
|
|
|
|
ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */
|
|
|
|
} zpool_ddt_prune_unit_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Bookmark name values.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_ERR_LIST "error list"
|
|
|
|
#define ZPOOL_ERR_DATASET "dataset"
|
|
|
|
#define ZPOOL_ERR_OBJECT "object"
|
|
|
|
|
|
|
|
/*
 * Sized as two MAXPATHLEN-length strings plus one extra byte.
 * NOTE(review): presumably the maximum length of one pool history
 * record -- confirm against the users of this macro.
 */
#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The following are names used in the nvlist describing
|
|
|
|
* the pool's history log.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_HIST_RECORD "history record"
|
|
|
|
#define ZPOOL_HIST_TIME "history time"
|
|
|
|
#define ZPOOL_HIST_CMD "history command"
|
|
|
|
#define ZPOOL_HIST_WHO "history who"
|
|
|
|
#define ZPOOL_HIST_ZONE "history zone"
|
|
|
|
#define ZPOOL_HIST_HOST "history hostname"
|
|
|
|
#define ZPOOL_HIST_TXG "history txg"
|
|
|
|
#define ZPOOL_HIST_INT_EVENT "history internal event"
|
|
|
|
#define ZPOOL_HIST_INT_STR "history internal str"
|
2013-08-28 15:45:09 +04:00
|
|
|
#define ZPOOL_HIST_INT_NAME "internal_name"
|
|
|
|
#define ZPOOL_HIST_IOCTL "ioctl"
|
|
|
|
#define ZPOOL_HIST_INPUT_NVL "in_nvl"
|
|
|
|
#define ZPOOL_HIST_OUTPUT_NVL "out_nvl"
|
2020-11-14 21:17:16 +03:00
|
|
|
#define ZPOOL_HIST_OUTPUT_SIZE "out_size"
|
2013-08-28 15:45:09 +04:00
|
|
|
#define ZPOOL_HIST_DSNAME "dsname"
|
|
|
|
#define ZPOOL_HIST_DSID "dsid"
|
2018-02-08 19:16:23 +03:00
|
|
|
#define ZPOOL_HIST_ERRNO "errno"
|
2021-01-11 20:29:25 +03:00
|
|
|
#define ZPOOL_HIST_ELAPSED_NS "elapsed_ns"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
/*
|
|
|
|
* Special nvlist name that will not have its args recorded in the pool's
|
|
|
|
* history log.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_HIDDEN_ARGS "hidden_args"
|
|
|
|
|
2024-07-26 19:16:18 +03:00
|
|
|
/*
|
|
|
|
* The following is used when invoking ZFS_IOC_POOL_GET_PROPS.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_GET_PROPS_NAMES "get_props_names"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Opt-in property names used with ZPOOL_GET_PROPS_NAMES.
|
|
|
|
* For example, properties that are hidden or expensive to compute.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_DEDUPCACHED_PROP_NAME "dedupcached"
|
|
|
|
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machine up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand: zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
/*
|
|
|
|
* The following are names used when invoking ZFS_IOC_POOL_INITIALIZE.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
|
|
|
|
#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
|
|
|
|
|
2024-08-26 19:27:24 +03:00
|
|
|
/*
|
|
|
|
* The following are names used when invoking ZFS_IOC_POOL_REGUID.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_REGUID_GUID "guid"
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
/*
|
|
|
|
* The following are names used when invoking ZFS_IOC_POOL_TRIM.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_TRIM_COMMAND "trim_command"
|
|
|
|
#define ZPOOL_TRIM_VDEVS "trim_vdevs"
|
|
|
|
#define ZPOOL_TRIM_RATE "trim_rate"
|
|
|
|
#define ZPOOL_TRIM_SECURE "trim_secure"
|
|
|
|
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
/*
|
|
|
|
* The following are names used when invoking ZFS_IOC_POOL_WAIT.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_WAIT_ACTIVITY "wait_activity"
|
|
|
|
#define ZPOOL_WAIT_TAG "wait_tag"
|
|
|
|
#define ZPOOL_WAIT_WAITED "wait_waited"
|
|
|
|
|
2021-11-30 17:46:25 +03:00
|
|
|
/*
|
|
|
|
* The following are names used when invoking ZFS_IOC_VDEV_GET_PROP.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_VDEV_PROPS_GET_VDEV "vdevprops_get_vdev"
|
|
|
|
#define ZPOOL_VDEV_PROPS_GET_PROPS "vdevprops_get_props"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The following are names used when invoking ZFS_IOC_VDEV_SET_PROP.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_VDEV_PROPS_SET_VDEV "vdevprops_set_vdev"
|
|
|
|
#define ZPOOL_VDEV_PROPS_SET_PROPS "vdevprops_set_props"
|
|
|
|
|
2020-04-01 20:02:06 +03:00
|
|
|
/*
|
|
|
|
* The following are names used when invoking ZFS_IOC_WAIT_FS.
|
|
|
|
*/
|
|
|
|
/*
 * These key strings are identical to the ones defined for
 * ZPOOL_WAIT_ACTIVITY and ZPOOL_WAIT_WAITED ("wait_activity" and
 * "wait_waited").
 */
#define ZFS_WAIT_ACTIVITY "wait_activity"
|
|
|
|
#define ZFS_WAIT_WAITED "wait_waited"
|
|
|
|
|
2024-07-26 19:16:18 +03:00
|
|
|
/*
|
|
|
|
* The following are names used when invoking ZFS_IOC_POOL_PREFETCH.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_PREFETCH_TYPE "prefetch_type"
|
|
|
|
|
2024-06-18 01:35:18 +03:00
|
|
|
/*
|
|
|
|
* The following are names used when invoking ZFS_IOC_DDT_PRUNE.
|
|
|
|
*/
|
|
|
|
#define DDT_PRUNE_UNIT "ddt_prune_unit"
|
|
|
|
#define DDT_PRUNE_AMOUNT "ddt_prune_amount"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Flags for ZFS_IOC_VDEV_SET_STATE
|
|
|
|
*/
|
|
|
|
/*
 * Each ZFS_ONLINE_* value is a distinct single bit (0x1 through 0x10),
 * so the online flags can be combined with bitwise OR.
 * ZFS_OFFLINE_TEMPORARY reuses 0x1, the same value as
 * ZFS_ONLINE_CHECKREMOVE.
 * NOTE(review): presumably online and offline flags are never mixed in
 * a single request, which would make the overlap harmless -- confirm.
 */
#define ZFS_ONLINE_CHECKREMOVE 0x1
|
|
|
|
#define ZFS_ONLINE_UNSPARE 0x2
|
|
|
|
#define ZFS_ONLINE_FORCEFAULT 0x4
|
2009-07-03 02:44:48 +04:00
|
|
|
#define ZFS_ONLINE_EXPAND 0x8
|
2023-01-09 23:43:03 +03:00
|
|
|
#define ZFS_ONLINE_SPARE 0x10
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZFS_OFFLINE_TEMPORARY 0x1
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
 * Flag bits for ZFS_IOC_POOL_IMPORT.  ZFS_IMPORT_NORMAL is the empty
 * flag set; every other value is a single distinct bit.
 */
#define	ZFS_IMPORT_NORMAL	0x0
#define	ZFS_IMPORT_VERBATIM	0x1
#define	ZFS_IMPORT_ANY_HOST	0x2
#define	ZFS_IMPORT_MISSING_LOG	0x4
#define	ZFS_IMPORT_ONLY		0x8
#define	ZFS_IMPORT_TEMP_NAME	0x10
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
/* NOTE(review): presumably skips the multihost (MMP) activity test on import. */
#define	ZFS_IMPORT_SKIP_MMP	0x20
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
/* Further single-bit ZFS_IMPORT_* flags. */
#define	ZFS_IMPORT_LOAD_KEYS	0x40
#define	ZFS_IMPORT_CHECKPOINT	0x80
|
2010-08-27 01:24:34 +04:00
|
|
|
|
2018-02-08 19:16:23 +03:00
|
|
|
/*
 * Channel programs: nvlist key names for arguments (ZCP_ARG_*) and
 * results (ZCP_RET_*), followed by the default and maximum limits.
 * Each maximum is ten times the corresponding default.
 */
#define	ZCP_ARG_PROGRAM		"program"
#define	ZCP_ARG_ARGLIST		"arg"
#define	ZCP_ARG_SYNC		"sync"
#define	ZCP_ARG_INSTRLIMIT	"instrlimit"
#define	ZCP_ARG_MEMLIMIT	"memlimit"

#define	ZCP_ARG_CLIARGV		"argv"

#define	ZCP_RET_ERROR		"error"
#define	ZCP_RET_RETURN		"return"

#define	ZCP_DEFAULT_INSTRLIMIT	(10 * 1000 * 1000)
#define	ZCP_MAX_INSTRLIMIT	(10 * ZCP_DEFAULT_INSTRLIMIT)
#define	ZCP_DEFAULT_MEMLIMIT	(10 * 1024 * 1024)
#define	ZCP_MAX_MEMLIMIT	(10 * ZCP_DEFAULT_MEMLIMIT)
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * Sysevent payload members.  The sysevents that ZFS generates, and the
 * payload carried by each, are:
 *
 *	ESC_ZFS_RESILVER_START
 *	ESC_ZFS_RESILVER_FINISH
 *
 *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
 *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
 *		ZFS_EV_RESILVER_TYPE	DATA_TYPE_STRING
 *
 *	ESC_ZFS_POOL_DESTROY
 *	ESC_ZFS_POOL_REGUID
 *
 *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
 *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
 *
 *	ESC_ZFS_VDEV_REMOVE
 *	ESC_ZFS_VDEV_CLEAR
 *	ESC_ZFS_VDEV_CHECK
 *
 *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
 *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
 *		ZFS_EV_VDEV_PATH	DATA_TYPE_STRING	(optional)
 *		ZFS_EV_VDEV_GUID	DATA_TYPE_UINT64
 *
 *	ESC_ZFS_HISTORY_EVENT
 *
 *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
 *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
 *		ZFS_EV_HIST_TIME	DATA_TYPE_UINT64	(optional)
 *		ZFS_EV_HIST_CMD		DATA_TYPE_STRING	(optional)
 *		ZFS_EV_HIST_WHO		DATA_TYPE_UINT64	(optional)
 *		ZFS_EV_HIST_ZONE	DATA_TYPE_STRING	(optional)
 *		ZFS_EV_HIST_HOST	DATA_TYPE_STRING	(optional)
 *		ZFS_EV_HIST_TXG		DATA_TYPE_UINT64	(optional)
 *		ZFS_EV_HIST_INT_EVENT	DATA_TYPE_UINT64	(optional)
 *		ZFS_EV_HIST_INT_STR	DATA_TYPE_STRING	(optional)
 *		ZFS_EV_HIST_INT_NAME	DATA_TYPE_STRING	(optional)
 *		ZFS_EV_HIST_IOCTL	DATA_TYPE_STRING	(optional)
 *		ZFS_EV_HIST_DSNAME	DATA_TYPE_STRING	(optional)
 *		ZFS_EV_HIST_DSID	DATA_TYPE_UINT64	(optional)
 *
 * Each ZFS_EV_HIST_* member corresponds to a ZPOOL_HIST_* member of the
 * history log nvlist.  The key names contain no spaces or other characters
 * that consumers of the sysevents might not expect.
 */
#define	ZFS_EV_POOL_NAME	"pool_name"
#define	ZFS_EV_POOL_GUID	"pool_guid"
#define	ZFS_EV_VDEV_PATH	"vdev_path"
#define	ZFS_EV_VDEV_GUID	"vdev_guid"
#define	ZFS_EV_HIST_TIME	"history_time"
#define	ZFS_EV_HIST_CMD		"history_command"
#define	ZFS_EV_HIST_WHO		"history_who"
#define	ZFS_EV_HIST_ZONE	"history_zone"
#define	ZFS_EV_HIST_HOST	"history_hostname"
#define	ZFS_EV_HIST_TXG		"history_txg"
#define	ZFS_EV_HIST_INT_EVENT	"history_internal_event"
#define	ZFS_EV_HIST_INT_STR	"history_internal_str"
#define	ZFS_EV_HIST_INT_NAME	"history_internal_name"
#define	ZFS_EV_HIST_IOCTL	"history_ioctl"
#define	ZFS_EV_HIST_DSNAME	"history_dsname"
#define	ZFS_EV_HIST_DSID	"history_dsid"
#define	ZFS_EV_RESILVER_TYPE	"resilver_type"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2021-05-27 18:09:32 +03:00
|
|
|
/*
 * Supported block sizes range from 512 bytes to 16MB.  Larger blocks (and
 * therefore larger IOs) have benefits, but these must be balanced against
 * the cost of COWing a giant block to modify a single byte and the high
 * latency of reading or writing a large block.
 *
 * The recordsize property cannot be set larger than zfs_max_recordsize
 * (default 16MB on 64-bit and 1MB on 32-bit).  See the comment near
 * zfs_max_recordsize in dsl_dataset.c for details.
 *
 * The LSIZE field of the blkptr_t can store sizes up to 32MB, but the
 * dnode's dn_datablkszsec can only store sizes up to 32MB - 512 bytes,
 * which is why SPA_MAXBLOCKSIZE is limited to 16MB.
 *
 * The *SHIFT values are log2 of the matching *SIZE values; the sizes are
 * unsigned long long constants.
 */
#define	SPA_MINBLOCKSHIFT	9
#define	SPA_OLD_MAXBLOCKSHIFT	17
#define	SPA_MAXBLOCKSHIFT	24
#define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
#define	SPA_OLD_MAXBLOCKSIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)
#define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)
|
|
|
|
|
|
|
|
/*
 * Supported encryption algorithms.
 *
 * The values are spelled out explicitly; they match the implicit numbering
 * that starts from ZIO_CRYPT_INHERIT = 0.
 */
enum zio_encrypt {
	ZIO_CRYPT_INHERIT = 0,
	ZIO_CRYPT_ON = 1,
	ZIO_CRYPT_OFF = 2,
	ZIO_CRYPT_AES_128_CCM = 3,
	ZIO_CRYPT_AES_192_CCM = 4,
	ZIO_CRYPT_AES_256_CCM = 5,
	ZIO_CRYPT_AES_128_GCM = 6,
	ZIO_CRYPT_AES_192_GCM = 7,
	ZIO_CRYPT_AES_256_GCM = 8,
	/* Sentinel: equals the number of values listed above. */
	ZIO_CRYPT_FUNCTIONS = 9
};

/* NOTE(review): presumably the concrete algorithm ZIO_CRYPT_ON resolves to. */
#define	ZIO_CRYPT_ON_VALUE	ZIO_CRYPT_AES_256_GCM
#define	ZIO_CRYPT_DEFAULT	ZIO_CRYPT_OFF
|
|
|
|
|
2022-02-16 03:35:30 +03:00
|
|
|
/*
 * xattr namespace prefixes.  These are forbidden in xattr names.
 *
 * For cross-platform compatibility, xattrs in the user namespace should not be
 * prefixed with the namespace name, but for backwards compatibility with older
 * ZFS on Linux versions we do prefix the namespace.
 *
 * Each *_PREFIX_LEN is derived from its *_PREFIX literal with sizeof, minus
 * one for the terminating NUL.  The length is therefore a compile-time
 * constant and cannot drift from the prefix it describes.
 */
#define	ZFS_XA_NS_FREEBSD_PREFIX		"freebsd:"
#define	ZFS_XA_NS_FREEBSD_PREFIX_LEN		\
	(sizeof (ZFS_XA_NS_FREEBSD_PREFIX) - 1)
#define	ZFS_XA_NS_LINUX_SECURITY_PREFIX		"security."
#define	ZFS_XA_NS_LINUX_SECURITY_PREFIX_LEN	\
	(sizeof (ZFS_XA_NS_LINUX_SECURITY_PREFIX) - 1)
#define	ZFS_XA_NS_LINUX_SYSTEM_PREFIX		"system."
#define	ZFS_XA_NS_LINUX_SYSTEM_PREFIX_LEN	\
	(sizeof (ZFS_XA_NS_LINUX_SYSTEM_PREFIX) - 1)
#define	ZFS_XA_NS_LINUX_TRUSTED_PREFIX		"trusted."
#define	ZFS_XA_NS_LINUX_TRUSTED_PREFIX_LEN	\
	(sizeof (ZFS_XA_NS_LINUX_TRUSTED_PREFIX) - 1)
#define	ZFS_XA_NS_LINUX_USER_PREFIX		"user."
#define	ZFS_XA_NS_LINUX_USER_PREFIX_LEN		\
	(sizeof (ZFS_XA_NS_LINUX_USER_PREFIX) - 1)

/*
 * Evaluates to non-zero when the string `name` begins with the prefix of
 * namespace `ns`.  `ns` is pasted into the macro names, so it must be one
 * of FREEBSD, LINUX_SECURITY, LINUX_SYSTEM, LINUX_TRUSTED or LINUX_USER.
 */
#define	ZFS_XA_NS_PREFIX_MATCH(ns, name) \
	(strncmp((name), ZFS_XA_NS_##ns##_PREFIX, \
	ZFS_XA_NS_##ns##_PREFIX_LEN) == 0)

/*
 * Evaluates to non-zero when `name` begins with any of the namespace
 * prefixes above.  `name` is expanded once per namespace, so pass an
 * expression without side effects.
 */
#define	ZFS_XA_NS_PREFIX_FORBIDDEN(name) \
	(ZFS_XA_NS_PREFIX_MATCH(FREEBSD, name) || \
	ZFS_XA_NS_PREFIX_MATCH(LINUX_SECURITY, name) || \
	ZFS_XA_NS_PREFIX_MATCH(LINUX_SYSTEM, name) || \
	ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \
	ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name))
|
2021-05-27 18:09:32 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _SYS_FS_ZFS_H */
|