2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
2009-08-18 22:43:27 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2014-05-23 20:21:07 +04:00
|
|
|
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
2011-11-12 02:07:54 +04:00
|
|
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
2017-05-30 21:39:17 +03:00
|
|
|
* Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved.
|
|
|
|
* Copyright (c) 2014 Integros [integros.com]
|
2017-05-19 22:33:11 +03:00
|
|
|
* Copyright (c) 2017 Datto Inc.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/* Portions Copyright 2010 Robert Milkowski */
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#ifndef _SYS_FS_ZFS_H
|
|
|
|
#define _SYS_FS_ZFS_H
|
|
|
|
|
2009-08-18 22:43:27 +04:00
|
|
|
#include <sys/time.h>
|
2016-02-29 21:05:23 +03:00
|
|
|
#include <sys/zio_priority.h>
|
2009-08-18 22:43:27 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Types and constants shared between userland and the kernel.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Each dataset can be one of the following types. These constants can be
|
|
|
|
* combined into masks that can be passed to various functions.
|
|
|
|
*/
|
|
|
|
typedef enum {
|
2013-12-12 02:33:41 +04:00
|
|
|
ZFS_TYPE_FILESYSTEM = (1 << 0),
|
|
|
|
ZFS_TYPE_SNAPSHOT = (1 << 1),
|
|
|
|
ZFS_TYPE_VOLUME = (1 << 2),
|
|
|
|
ZFS_TYPE_POOL = (1 << 3),
|
|
|
|
ZFS_TYPE_BOOKMARK = (1 << 4)
|
2008-11-20 23:01:55 +03:00
|
|
|
} zfs_type_t;
|
|
|
|
|
2017-01-23 20:49:57 +03:00
|
|
|
/*
|
|
|
|
* NB: lzc_dataset_type should be updated whenever a new objset type is added,
|
|
|
|
* if it represents a real type of a dataset that can be created from userland.
|
|
|
|
*/
|
2013-08-28 15:45:09 +04:00
|
|
|
typedef enum dmu_objset_type {
|
|
|
|
DMU_OST_NONE,
|
|
|
|
DMU_OST_META,
|
|
|
|
DMU_OST_ZFS,
|
|
|
|
DMU_OST_ZVOL,
|
|
|
|
DMU_OST_OTHER, /* For testing only! */
|
|
|
|
DMU_OST_ANY, /* Be careful! */
|
|
|
|
DMU_OST_NUMTYPES
|
|
|
|
} dmu_objset_type_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZFS_TYPE_DATASET \
|
|
|
|
(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
|
|
|
|
|
2016-06-16 00:28:36 +03:00
|
|
|
/*
|
|
|
|
* All of these include the terminating NUL byte.
|
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
#define ZAP_MAXNAMELEN 256
|
|
|
|
#define ZAP_MAXVALUELEN (1024 * 8)
|
|
|
|
#define ZAP_OLDMAXVALUELEN 1024
|
2016-06-16 00:28:36 +03:00
|
|
|
#define ZFS_MAX_DATASET_NAME_LEN 256
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Dataset properties are identified by these constants and must be added to
|
|
|
|
* the end of this list to ensure that external consumers are not affected
|
|
|
|
* by the change. If you make any changes to this list, be sure to update
|
2013-07-03 05:55:16 +04:00
|
|
|
* the property table in module/zcommon/zfs_prop.c.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
typedef enum {
|
2017-01-18 01:45:02 +03:00
|
|
|
ZFS_PROP_BAD = -1,
|
|
|
|
ZFS_PROP_TYPE = 0,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_PROP_CREATION,
|
|
|
|
ZFS_PROP_USED,
|
|
|
|
ZFS_PROP_AVAILABLE,
|
|
|
|
ZFS_PROP_REFERENCED,
|
|
|
|
ZFS_PROP_COMPRESSRATIO,
|
|
|
|
ZFS_PROP_MOUNTED,
|
|
|
|
ZFS_PROP_ORIGIN,
|
|
|
|
ZFS_PROP_QUOTA,
|
|
|
|
ZFS_PROP_RESERVATION,
|
|
|
|
ZFS_PROP_VOLSIZE,
|
|
|
|
ZFS_PROP_VOLBLOCKSIZE,
|
|
|
|
ZFS_PROP_RECORDSIZE,
|
|
|
|
ZFS_PROP_MOUNTPOINT,
|
|
|
|
ZFS_PROP_SHARENFS,
|
|
|
|
ZFS_PROP_CHECKSUM,
|
|
|
|
ZFS_PROP_COMPRESSION,
|
|
|
|
ZFS_PROP_ATIME,
|
|
|
|
ZFS_PROP_DEVICES,
|
|
|
|
ZFS_PROP_EXEC,
|
|
|
|
ZFS_PROP_SETUID,
|
|
|
|
ZFS_PROP_READONLY,
|
|
|
|
ZFS_PROP_ZONED,
|
|
|
|
ZFS_PROP_SNAPDIR,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZFS_PROP_PRIVATE, /* not exposed to user, temporary */
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_PROP_ACLINHERIT,
|
2017-05-10 01:36:53 +03:00
|
|
|
ZFS_PROP_CREATETXG,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_PROP_NAME, /* not exposed to the user */
|
|
|
|
ZFS_PROP_CANMOUNT,
|
|
|
|
ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
|
|
|
|
ZFS_PROP_XATTR,
|
|
|
|
ZFS_PROP_NUMCLONES, /* not exposed to the user */
|
|
|
|
ZFS_PROP_COPIES,
|
|
|
|
ZFS_PROP_VERSION,
|
|
|
|
ZFS_PROP_UTF8ONLY,
|
|
|
|
ZFS_PROP_NORMALIZE,
|
|
|
|
ZFS_PROP_CASE,
|
|
|
|
ZFS_PROP_VSCAN,
|
|
|
|
ZFS_PROP_NBMAND,
|
|
|
|
ZFS_PROP_SHARESMB,
|
|
|
|
ZFS_PROP_REFQUOTA,
|
|
|
|
ZFS_PROP_REFRESERVATION,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZFS_PROP_GUID,
|
|
|
|
ZFS_PROP_PRIMARYCACHE,
|
|
|
|
ZFS_PROP_SECONDARYCACHE,
|
|
|
|
ZFS_PROP_USEDSNAP,
|
|
|
|
ZFS_PROP_USEDDS,
|
|
|
|
ZFS_PROP_USEDCHILD,
|
|
|
|
ZFS_PROP_USEDREFRESERV,
|
2009-07-03 02:44:48 +04:00
|
|
|
ZFS_PROP_USERACCOUNTING, /* not exposed to the user */
|
|
|
|
ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */
|
2009-08-18 22:43:27 +04:00
|
|
|
ZFS_PROP_DEFER_DESTROY,
|
|
|
|
ZFS_PROP_USERREFS,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZFS_PROP_LOGBIAS,
|
|
|
|
ZFS_PROP_UNIQUE, /* not exposed to the user */
|
|
|
|
ZFS_PROP_OBJSETID, /* not exposed to the user */
|
|
|
|
ZFS_PROP_DEDUP,
|
|
|
|
ZFS_PROP_MLSLABEL,
|
|
|
|
ZFS_PROP_SYNC,
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced to from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill bocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by an large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
ZFS_PROP_DNODESIZE,
|
2011-07-26 23:23:00 +04:00
|
|
|
ZFS_PROP_REFRATIO,
|
2011-11-17 22:14:36 +04:00
|
|
|
ZFS_PROP_WRITTEN,
|
|
|
|
ZFS_PROP_CLONES,
|
2013-02-22 13:23:09 +04:00
|
|
|
ZFS_PROP_LOGICALUSED,
|
|
|
|
ZFS_PROP_LOGICALREFERENCED,
|
2013-07-27 21:51:50 +04:00
|
|
|
ZFS_PROP_INCONSISTENT, /* not exposed to the user */
|
2017-07-12 23:05:37 +03:00
|
|
|
ZFS_PROP_VOLMODE,
|
2015-04-01 16:07:48 +03:00
|
|
|
ZFS_PROP_FILESYSTEM_LIMIT,
|
|
|
|
ZFS_PROP_SNAPSHOT_LIMIT,
|
|
|
|
ZFS_PROP_FILESYSTEM_COUNT,
|
|
|
|
ZFS_PROP_SNAPSHOT_COUNT,
|
2013-02-14 03:11:59 +04:00
|
|
|
ZFS_PROP_SNAPDEV,
|
2013-10-28 20:22:15 +04:00
|
|
|
ZFS_PROP_ACLTYPE,
|
2013-12-19 10:24:14 +04:00
|
|
|
ZFS_PROP_SELINUX_CONTEXT,
|
|
|
|
ZFS_PROP_SELINUX_FSCONTEXT,
|
|
|
|
ZFS_PROP_SELINUX_DEFCONTEXT,
|
|
|
|
ZFS_PROP_SELINUX_ROOTCONTEXT,
|
2014-01-18 23:00:53 +04:00
|
|
|
ZFS_PROP_RELATIME,
|
2014-05-23 20:21:07 +04:00
|
|
|
ZFS_PROP_REDUNDANT_METADATA,
|
2014-07-25 14:42:00 +04:00
|
|
|
ZFS_PROP_OVERLAY,
|
2016-01-01 16:15:31 +03:00
|
|
|
ZFS_PROP_PREV_SNAP,
|
2016-01-07 00:22:48 +03:00
|
|
|
ZFS_PROP_RECEIVE_RESUME_TOKEN,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_NUM_PROPS
|
|
|
|
} zfs_prop_t;
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
typedef enum {
|
|
|
|
ZFS_PROP_USERUSED,
|
|
|
|
ZFS_PROP_USERQUOTA,
|
|
|
|
ZFS_PROP_GROUPUSED,
|
|
|
|
ZFS_PROP_GROUPQUOTA,
|
2016-10-04 21:46:10 +03:00
|
|
|
ZFS_PROP_USEROBJUSED,
|
|
|
|
ZFS_PROP_USEROBJQUOTA,
|
|
|
|
ZFS_PROP_GROUPOBJUSED,
|
|
|
|
ZFS_PROP_GROUPOBJQUOTA,
|
2009-07-03 02:44:48 +04:00
|
|
|
ZFS_NUM_USERQUOTA_PROPS
|
|
|
|
} zfs_userquota_prop_t;
|
|
|
|
|
|
|
|
extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Pool properties are identified by these constants and must be added to the
|
2008-12-03 23:09:06 +03:00
|
|
|
* end of this list to ensure that external consumers are not affected
|
2008-11-20 23:01:55 +03:00
|
|
|
* by the change. If you make any changes to this list, be sure to update
|
2013-07-03 05:55:16 +04:00
|
|
|
* the property table in module/zcommon/zpool_prop.c.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
typedef enum {
|
|
|
|
ZPOOL_PROP_NAME,
|
|
|
|
ZPOOL_PROP_SIZE,
|
|
|
|
ZPOOL_PROP_CAPACITY,
|
|
|
|
ZPOOL_PROP_ALTROOT,
|
|
|
|
ZPOOL_PROP_HEALTH,
|
|
|
|
ZPOOL_PROP_GUID,
|
|
|
|
ZPOOL_PROP_VERSION,
|
|
|
|
ZPOOL_PROP_BOOTFS,
|
|
|
|
ZPOOL_PROP_DELEGATION,
|
|
|
|
ZPOOL_PROP_AUTOREPLACE,
|
|
|
|
ZPOOL_PROP_CACHEFILE,
|
|
|
|
ZPOOL_PROP_FAILUREMODE,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZPOOL_PROP_LISTSNAPS,
|
2009-07-03 02:44:48 +04:00
|
|
|
ZPOOL_PROP_AUTOEXPAND,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZPOOL_PROP_DEDUPDITTO,
|
|
|
|
ZPOOL_PROP_DEDUPRATIO,
|
|
|
|
ZPOOL_PROP_FREE,
|
|
|
|
ZPOOL_PROP_ALLOCATED,
|
2010-08-27 01:24:34 +04:00
|
|
|
ZPOOL_PROP_READONLY,
|
2011-06-16 23:56:38 +04:00
|
|
|
ZPOOL_PROP_ASHIFT,
|
2011-11-15 23:01:27 +04:00
|
|
|
ZPOOL_PROP_COMMENT,
|
2012-01-24 06:43:32 +04:00
|
|
|
ZPOOL_PROP_EXPANDSZ,
|
2012-12-14 03:24:15 +04:00
|
|
|
ZPOOL_PROP_FREEING,
|
2014-07-20 00:19:24 +04:00
|
|
|
ZPOOL_PROP_FRAGMENTATION,
|
2014-06-06 01:20:08 +04:00
|
|
|
ZPOOL_PROP_LEAKED,
|
2014-11-03 23:15:08 +03:00
|
|
|
ZPOOL_PROP_MAXBLOCKSIZE,
|
2014-06-21 03:00:11 +04:00
|
|
|
ZPOOL_PROP_TNAME,
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced to from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill bocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by an large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
ZPOOL_PROP_MAXDNODESIZE,
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
ZPOOL_PROP_MULTIHOST,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZPOOL_NUM_PROPS
|
|
|
|
} zpool_prop_t;
|
|
|
|
|
2011-11-15 23:01:27 +04:00
|
|
|
/* Small enough to not hog a whole line of printout in zpool(1M). */
|
|
|
|
#define ZPROP_MAX_COMMENT 32
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPROP_CONT -2
|
|
|
|
#define ZPROP_INVAL -1
|
|
|
|
|
|
|
|
#define ZPROP_VALUE "value"
|
|
|
|
#define ZPROP_SOURCE "source"
|
|
|
|
|
|
|
|
typedef enum {
|
|
|
|
ZPROP_SRC_NONE = 0x1,
|
|
|
|
ZPROP_SRC_DEFAULT = 0x2,
|
|
|
|
ZPROP_SRC_TEMPORARY = 0x4,
|
|
|
|
ZPROP_SRC_LOCAL = 0x8,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZPROP_SRC_INHERITED = 0x10,
|
|
|
|
ZPROP_SRC_RECEIVED = 0x20
|
2008-11-20 23:01:55 +03:00
|
|
|
} zprop_source_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPROP_SRC_ALL 0x3f
|
|
|
|
|
|
|
|
#define ZPROP_SOURCE_VAL_RECVD "$recvd"
|
|
|
|
#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS"
|
2016-06-10 03:04:12 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Dataset flag implemented as a special entry in the props zap object
|
|
|
|
* indicating that the dataset has received properties on or after
|
|
|
|
* SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
|
|
|
|
* just as it did in earlier versions, and thereafter, local properties are
|
|
|
|
* preserved.
|
|
|
|
*/
|
|
|
|
#define ZPROP_HAS_RECVD "$hasrecvd"
|
|
|
|
|
|
|
|
typedef enum {
|
|
|
|
ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
|
|
|
|
ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
|
|
|
|
} zprop_errflags_t;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
typedef int (*zprop_func)(int, void *);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Properties to be set on the root file system of a new pool
|
|
|
|
* are stuffed into their own nvlist, which is then included in
|
|
|
|
* the properties nvlist with the pool properties.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_ROOTFS_PROPS "root-props-nvl"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Dataset property functions shared between libzfs and kernel.
|
|
|
|
*/
|
|
|
|
const char *zfs_prop_default_string(zfs_prop_t);
|
|
|
|
uint64_t zfs_prop_default_numeric(zfs_prop_t);
|
|
|
|
boolean_t zfs_prop_readonly(zfs_prop_t);
|
|
|
|
boolean_t zfs_prop_inheritable(zfs_prop_t);
|
|
|
|
boolean_t zfs_prop_setonce(zfs_prop_t);
|
|
|
|
const char *zfs_prop_to_name(zfs_prop_t);
|
|
|
|
zfs_prop_t zfs_name_to_prop(const char *);
|
|
|
|
boolean_t zfs_prop_user(const char *);
|
2010-05-29 00:45:14 +04:00
|
|
|
boolean_t zfs_prop_userquota(const char *);
|
2011-11-17 22:14:36 +04:00
|
|
|
boolean_t zfs_prop_written(const char *);
|
2008-11-20 23:01:55 +03:00
|
|
|
int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
|
|
|
|
int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
|
2014-04-21 22:22:08 +04:00
|
|
|
boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Pool property functions shared between libzfs and kernel.
|
|
|
|
*/
|
|
|
|
zpool_prop_t zpool_name_to_prop(const char *);
|
|
|
|
const char *zpool_prop_to_name(zpool_prop_t);
|
|
|
|
const char *zpool_prop_default_string(zpool_prop_t);
|
|
|
|
uint64_t zpool_prop_default_numeric(zpool_prop_t);
|
|
|
|
boolean_t zpool_prop_readonly(zpool_prop_t);
|
2012-12-14 03:24:15 +04:00
|
|
|
boolean_t zpool_prop_feature(const char *);
|
|
|
|
boolean_t zpool_prop_unsupported(const char *);
|
2008-11-20 23:01:55 +03:00
|
|
|
int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
|
|
|
|
int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Definitions for the Delegation.
|
|
|
|
*/
|
|
|
|
typedef enum {
|
|
|
|
ZFS_DELEG_WHO_UNKNOWN = 0,
|
|
|
|
ZFS_DELEG_USER = 'u',
|
|
|
|
ZFS_DELEG_USER_SETS = 'U',
|
|
|
|
ZFS_DELEG_GROUP = 'g',
|
|
|
|
ZFS_DELEG_GROUP_SETS = 'G',
|
|
|
|
ZFS_DELEG_EVERYONE = 'e',
|
|
|
|
ZFS_DELEG_EVERYONE_SETS = 'E',
|
|
|
|
ZFS_DELEG_CREATE = 'c',
|
|
|
|
ZFS_DELEG_CREATE_SETS = 'C',
|
|
|
|
ZFS_DELEG_NAMED_SET = 's',
|
|
|
|
ZFS_DELEG_NAMED_SET_SETS = 'S'
|
|
|
|
} zfs_deleg_who_type_t;
|
|
|
|
|
|
|
|
typedef enum {
|
|
|
|
ZFS_DELEG_NONE = 0,
|
|
|
|
ZFS_DELEG_PERM_LOCAL = 1,
|
|
|
|
ZFS_DELEG_PERM_DESCENDENT = 2,
|
|
|
|
ZFS_DELEG_PERM_LOCALDESCENDENT = 3,
|
|
|
|
ZFS_DELEG_PERM_CREATE = 4
|
|
|
|
} zfs_deleg_inherit_t;
|
|
|
|
|
|
|
|
#define ZFS_DELEG_PERM_UID "uid"
|
|
|
|
#define ZFS_DELEG_PERM_GID "gid"
|
|
|
|
#define ZFS_DELEG_PERM_GROUPS "groups"
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZFS_MLSLABEL_DEFAULT "none"
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
#define ZFS_SMB_ACL_SRC "src"
|
|
|
|
#define ZFS_SMB_ACL_TARGET "target"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
typedef enum {
|
|
|
|
ZFS_CANMOUNT_OFF = 0,
|
|
|
|
ZFS_CANMOUNT_ON = 1,
|
|
|
|
ZFS_CANMOUNT_NOAUTO = 2
|
|
|
|
} zfs_canmount_type_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
typedef enum {
|
|
|
|
ZFS_LOGBIAS_LATENCY = 0,
|
|
|
|
ZFS_LOGBIAS_THROUGHPUT = 1
|
|
|
|
} zfs_logbias_op_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
typedef enum zfs_share_op {
|
|
|
|
ZFS_SHARE_NFS = 0,
|
|
|
|
ZFS_UNSHARE_NFS = 1,
|
|
|
|
ZFS_SHARE_SMB = 2,
|
|
|
|
ZFS_UNSHARE_SMB = 3
|
|
|
|
} zfs_share_op_t;
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
typedef enum zfs_smb_acl_op {
|
|
|
|
ZFS_SMB_ACL_ADD,
|
|
|
|
ZFS_SMB_ACL_REMOVE,
|
|
|
|
ZFS_SMB_ACL_RENAME,
|
|
|
|
ZFS_SMB_ACL_PURGE
|
|
|
|
} zfs_smb_acl_op_t;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
typedef enum zfs_cache_type {
|
|
|
|
ZFS_CACHE_NONE = 0,
|
|
|
|
ZFS_CACHE_METADATA = 1,
|
|
|
|
ZFS_CACHE_ALL = 2
|
|
|
|
} zfs_cache_type_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
typedef enum {
|
|
|
|
ZFS_SYNC_STANDARD = 0,
|
|
|
|
ZFS_SYNC_ALWAYS = 1,
|
|
|
|
ZFS_SYNC_DISABLED = 2
|
|
|
|
} zfs_sync_type_t;
|
|
|
|
|
2011-10-25 03:55:20 +04:00
|
|
|
typedef enum {
|
|
|
|
ZFS_XATTR_OFF = 0,
|
|
|
|
ZFS_XATTR_DIR = 1,
|
|
|
|
ZFS_XATTR_SA = 2
|
|
|
|
} zfs_xattr_type_t;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced to from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill bocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by an large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
typedef enum {
|
|
|
|
ZFS_DNSIZE_LEGACY = 0,
|
|
|
|
ZFS_DNSIZE_AUTO = 1,
|
|
|
|
ZFS_DNSIZE_1K = 1024,
|
|
|
|
ZFS_DNSIZE_2K = 2048,
|
|
|
|
ZFS_DNSIZE_4K = 4096,
|
|
|
|
ZFS_DNSIZE_8K = 8192,
|
|
|
|
ZFS_DNSIZE_16K = 16384
|
|
|
|
} zfs_dnsize_type_t;
|
|
|
|
|
2014-05-23 20:21:07 +04:00
|
|
|
typedef enum {
|
|
|
|
ZFS_REDUNDANT_METADATA_ALL,
|
|
|
|
ZFS_REDUNDANT_METADATA_MOST
|
|
|
|
} zfs_redundant_metadata_type_t;
|
|
|
|
|
2017-07-12 23:05:37 +03:00
|
|
|
typedef enum {
|
|
|
|
ZFS_VOLMODE_DEFAULT = 0,
|
|
|
|
ZFS_VOLMODE_GEOM = 1,
|
|
|
|
ZFS_VOLMODE_DEV = 2,
|
|
|
|
ZFS_VOLMODE_NONE = 3
|
|
|
|
} zfs_volmode_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* On-disk version number.
|
|
|
|
*/
|
|
|
|
#define SPA_VERSION_1 1ULL
|
|
|
|
#define SPA_VERSION_2 2ULL
|
|
|
|
#define SPA_VERSION_3 3ULL
|
|
|
|
#define SPA_VERSION_4 4ULL
|
|
|
|
#define SPA_VERSION_5 5ULL
|
|
|
|
#define SPA_VERSION_6 6ULL
|
|
|
|
#define SPA_VERSION_7 7ULL
|
|
|
|
#define SPA_VERSION_8 8ULL
|
|
|
|
#define SPA_VERSION_9 9ULL
|
|
|
|
#define SPA_VERSION_10 10ULL
|
2008-12-03 23:09:06 +03:00
|
|
|
#define SPA_VERSION_11 11ULL
|
|
|
|
#define SPA_VERSION_12 12ULL
|
|
|
|
#define SPA_VERSION_13 13ULL
|
|
|
|
#define SPA_VERSION_14 14ULL
|
2009-07-03 02:44:48 +04:00
|
|
|
#define SPA_VERSION_15 15ULL
|
|
|
|
#define SPA_VERSION_16 16ULL
|
2009-08-18 22:43:27 +04:00
|
|
|
#define SPA_VERSION_17 17ULL
|
|
|
|
#define SPA_VERSION_18 18ULL
|
2010-05-29 00:45:14 +04:00
|
|
|
#define SPA_VERSION_19 19ULL
|
|
|
|
#define SPA_VERSION_20 20ULL
|
|
|
|
#define SPA_VERSION_21 21ULL
|
|
|
|
#define SPA_VERSION_22 22ULL
|
|
|
|
#define SPA_VERSION_23 23ULL
|
|
|
|
#define SPA_VERSION_24 24ULL
|
|
|
|
#define SPA_VERSION_25 25ULL
|
|
|
|
#define SPA_VERSION_26 26ULL
|
2010-08-27 01:24:34 +04:00
|
|
|
#define SPA_VERSION_27 27ULL
|
|
|
|
#define SPA_VERSION_28 28ULL
|
2012-12-14 03:24:15 +04:00
|
|
|
#define SPA_VERSION_5000 5000ULL
|
2010-08-27 01:24:34 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
|
2009-07-03 02:44:48 +04:00
|
|
|
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
|
|
|
|
* and do the appropriate changes. Also bump the version number in
|
|
|
|
* usr/src/grub/capability.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2012-12-14 03:24:15 +04:00
|
|
|
#define SPA_VERSION SPA_VERSION_5000
|
|
|
|
#define SPA_VERSION_STRING "5000"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Symbolic names for the changes that caused a SPA_VERSION switch.
|
|
|
|
* Used in the code when checking for presence or absence of a feature.
|
|
|
|
* Feel free to define multiple symbolic names for each version if there
|
|
|
|
* were multiple changes to on-disk structures during that version.
|
|
|
|
*
|
|
|
|
* NOTE: When checking the current SPA_VERSION in your code, be sure
|
|
|
|
* to use spa_version() since it reports the version of the
|
|
|
|
* last synced uberblock. Checking the in-flight version can
|
|
|
|
* be dangerous in some cases.
|
|
|
|
*/
|
|
|
|
#define SPA_VERSION_INITIAL SPA_VERSION_1
|
|
|
|
#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
|
|
|
|
#define SPA_VERSION_SPARES SPA_VERSION_3
|
2009-08-18 22:43:27 +04:00
|
|
|
#define SPA_VERSION_RAIDZ2 SPA_VERSION_3
|
2010-05-29 00:45:14 +04:00
|
|
|
#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3
|
2008-11-20 23:01:55 +03:00
|
|
|
#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
|
|
|
|
#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
|
|
|
|
#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
|
|
|
|
#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5
|
|
|
|
#define SPA_VERSION_BOOTFS SPA_VERSION_6
|
|
|
|
#define SPA_VERSION_SLOGS SPA_VERSION_7
|
|
|
|
#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8
|
|
|
|
#define SPA_VERSION_FUID SPA_VERSION_9
|
|
|
|
#define SPA_VERSION_REFRESERVATION SPA_VERSION_9
|
|
|
|
#define SPA_VERSION_REFQUOTA SPA_VERSION_9
|
|
|
|
#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9
|
|
|
|
#define SPA_VERSION_L2CACHE SPA_VERSION_10
|
2008-12-03 23:09:06 +03:00
|
|
|
#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11
|
|
|
|
#define SPA_VERSION_ORIGIN SPA_VERSION_11
|
|
|
|
#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11
|
|
|
|
#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12
|
|
|
|
#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
|
|
|
|
#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
|
2009-07-03 02:44:48 +04:00
|
|
|
#define SPA_VERSION_USERSPACE SPA_VERSION_15
|
|
|
|
#define SPA_VERSION_STMF_PROP SPA_VERSION_16
|
2009-08-18 22:43:27 +04:00
|
|
|
#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
|
|
|
|
#define SPA_VERSION_USERREFS SPA_VERSION_18
|
2010-05-29 00:45:14 +04:00
|
|
|
#define SPA_VERSION_HOLES SPA_VERSION_19
|
|
|
|
#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
|
|
|
|
#define SPA_VERSION_DEDUP SPA_VERSION_21
|
|
|
|
#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
|
|
|
|
#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
|
|
|
|
#define SPA_VERSION_SA SPA_VERSION_24
|
|
|
|
#define SPA_VERSION_SCAN SPA_VERSION_25
|
|
|
|
#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
|
|
|
|
#define SPA_VERSION_DEADLISTS SPA_VERSION_26
|
2010-08-27 01:24:34 +04:00
|
|
|
#define SPA_VERSION_FAST_SNAP SPA_VERSION_27
|
|
|
|
#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28
|
2012-12-14 03:24:15 +04:00
|
|
|
#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28
|
|
|
|
#define SPA_VERSION_FEATURES SPA_VERSION_5000
|
|
|
|
|
|
|
|
#define SPA_VERSION_IS_SUPPORTED(v) \
|
|
|
|
(((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
|
|
|
|
((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ZPL version - rev'd whenever an incompatible on-disk format change
|
|
|
|
* occurs. This is independent of SPA/DMU/ZAP versioning. You must
|
|
|
|
* also update the version_table[] and help message in zfs_prop.c.
|
|
|
|
*
|
|
|
|
* When changing, be sure to teach GRUB how to read the new format!
|
2009-07-03 02:44:48 +04:00
|
|
|
* See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*}
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
#define ZPL_VERSION_1 1ULL
|
|
|
|
#define ZPL_VERSION_2 2ULL
|
|
|
|
#define ZPL_VERSION_3 3ULL
|
2009-07-03 02:44:48 +04:00
|
|
|
#define ZPL_VERSION_4 4ULL
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPL_VERSION_5 5ULL
|
|
|
|
#define ZPL_VERSION ZPL_VERSION_5
|
|
|
|
#define ZPL_VERSION_STRING "5"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
#define ZPL_VERSION_INITIAL ZPL_VERSION_1
|
|
|
|
#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
|
|
|
|
#define ZPL_VERSION_FUID ZPL_VERSION_3
|
|
|
|
#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
|
|
|
|
#define ZPL_VERSION_SYSATTR ZPL_VERSION_3
|
2009-07-03 02:44:48 +04:00
|
|
|
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPL_VERSION_SA ZPL_VERSION_5
|
|
|
|
|
|
|
|
/* Rewind request information */
|
|
|
|
#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
|
|
|
|
#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
|
|
|
|
#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */
|
|
|
|
#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */
|
|
|
|
#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */
|
|
|
|
#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */
|
|
|
|
#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */
|
|
|
|
|
|
|
|
typedef struct zpool_rewind_policy {
|
|
|
|
uint32_t zrp_request; /* rewind behavior requested */
|
|
|
|
uint64_t zrp_maxmeta; /* max acceptable meta-data errors */
|
|
|
|
uint64_t zrp_maxdata; /* max acceptable data errors */
|
|
|
|
uint64_t zrp_txg; /* specific txg to load */
|
|
|
|
} zpool_rewind_policy_t;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The following are configuration names used in the nvlist describing a pool's
|
|
|
|
* configuration.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_CONFIG_VERSION "version"
|
|
|
|
#define ZPOOL_CONFIG_POOL_NAME "name"
|
|
|
|
#define ZPOOL_CONFIG_POOL_STATE "state"
|
|
|
|
#define ZPOOL_CONFIG_POOL_TXG "txg"
|
|
|
|
#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
|
|
|
|
#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
|
|
|
|
#define ZPOOL_CONFIG_TOP_GUID "top_guid"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
|
|
|
|
#define ZPOOL_CONFIG_TYPE "type"
|
|
|
|
#define ZPOOL_CONFIG_CHILDREN "children"
|
|
|
|
#define ZPOOL_CONFIG_ID "id"
|
|
|
|
#define ZPOOL_CONFIG_GUID "guid"
|
|
|
|
#define ZPOOL_CONFIG_PATH "path"
|
|
|
|
#define ZPOOL_CONFIG_DEVID "devid"
|
|
|
|
#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
|
|
|
|
#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
|
|
|
|
#define ZPOOL_CONFIG_ASHIFT "ashift"
|
|
|
|
#define ZPOOL_CONFIG_ASIZE "asize"
|
|
|
|
#define ZPOOL_CONFIG_DTL "DTL"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
/* container nvlist of extended stats */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex"
|
|
|
|
|
|
|
|
/* Active queue read/write stats */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue"
|
|
|
|
|
|
|
|
/* Queue sizes */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue"
|
|
|
|
|
|
|
|
/* Latency read/write histogram stats */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo"
|
|
|
|
|
2016-05-26 00:21:35 +03:00
|
|
|
/* Request size histograms */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
|
2016-02-29 21:05:23 +03:00
|
|
|
|
2016-10-24 20:45:59 +03:00
|
|
|
/* vdev enclosure sysfs path */
|
|
|
|
#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path"
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
|
|
|
|
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
|
|
|
|
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
|
|
|
|
#define ZPOOL_CONFIG_SPARES "spares"
|
|
|
|
#define ZPOOL_CONFIG_IS_SPARE "is_spare"
|
|
|
|
#define ZPOOL_CONFIG_NPARITY "nparity"
|
|
|
|
#define ZPOOL_CONFIG_HOSTID "hostid"
|
|
|
|
#define ZPOOL_CONFIG_HOSTNAME "hostname"
|
2010-08-27 01:24:34 +04:00
|
|
|
#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPOOL_CONFIG_UNSPARE "unspare"
|
|
|
|
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
|
|
|
|
#define ZPOOL_CONFIG_IS_LOG "is_log"
|
|
|
|
#define ZPOOL_CONFIG_L2CACHE "l2cache"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
|
|
|
|
#define ZPOOL_CONFIG_IS_HOLE "is_hole"
|
|
|
|
#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram"
|
|
|
|
#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats"
|
|
|
|
#define ZPOOL_CONFIG_DDT_STATS "ddt_stats"
|
|
|
|
#define ZPOOL_CONFIG_SPLIT "splitcfg"
|
|
|
|
#define ZPOOL_CONFIG_ORIG_GUID "orig_guid"
|
|
|
|
#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
|
|
|
|
#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
|
|
|
|
#define ZPOOL_CONFIG_REMOVING "removing"
|
2013-08-08 00:16:22 +04:00
|
|
|
#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
|
2011-11-15 23:01:27 +04:00
|
|
|
#define ZPOOL_CONFIG_COMMENT "comment"
|
2008-12-03 23:09:06 +03:00
|
|
|
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
|
2010-08-27 01:24:34 +04:00
|
|
|
#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */
|
2012-12-14 03:24:15 +04:00
|
|
|
#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */
|
2012-12-15 03:00:45 +04:00
|
|
|
#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */
|
2012-12-14 03:24:15 +04:00
|
|
|
#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
|
|
|
|
#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */
|
2014-02-21 07:57:17 +04:00
|
|
|
#define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */
|
2016-04-11 23:16:57 +03:00
|
|
|
#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
|
|
|
|
#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
|
|
|
|
#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */
|
|
|
|
#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* The persistent vdev state is stored as separate values rather than a single
|
|
|
|
* 'vdev_state' entry. This is because a device can be in multiple states, such
|
|
|
|
* as offline and degraded.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_CONFIG_OFFLINE "offline"
|
|
|
|
#define ZPOOL_CONFIG_FAULTED "faulted"
|
|
|
|
#define ZPOOL_CONFIG_DEGRADED "degraded"
|
|
|
|
#define ZPOOL_CONFIG_REMOVED "removed"
|
2009-07-03 02:44:48 +04:00
|
|
|
#define ZPOOL_CONFIG_FRU "fru"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZPOOL_CONFIG_AUX_STATE "aux_state"
|
|
|
|
|
|
|
|
/* Rewind policy parameters */
|
|
|
|
#define ZPOOL_REWIND_POLICY "rewind-policy"
|
|
|
|
#define ZPOOL_REWIND_REQUEST "rewind-request"
|
|
|
|
#define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg"
|
|
|
|
#define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh"
|
|
|
|
#define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh"
|
|
|
|
|
|
|
|
/* Rewind data discovered */
|
|
|
|
#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts"
|
|
|
|
#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors"
|
|
|
|
#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
#define VDEV_TYPE_ROOT "root"
|
|
|
|
#define VDEV_TYPE_MIRROR "mirror"
|
|
|
|
#define VDEV_TYPE_REPLACING "replacing"
|
|
|
|
#define VDEV_TYPE_RAIDZ "raidz"
|
|
|
|
#define VDEV_TYPE_DISK "disk"
|
|
|
|
#define VDEV_TYPE_FILE "file"
|
|
|
|
#define VDEV_TYPE_MISSING "missing"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define VDEV_TYPE_HOLE "hole"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define VDEV_TYPE_SPARE "spare"
|
|
|
|
#define VDEV_TYPE_LOG "log"
|
|
|
|
#define VDEV_TYPE_L2CACHE "l2cache"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is needed in userland to report the minimum necessary device size.
|
|
|
|
*/
|
|
|
|
#define SPA_MINDEVSIZE (64ULL << 20)
|
|
|
|
|
2014-07-20 00:19:24 +04:00
|
|
|
/*
|
|
|
|
* Set if the fragmentation has not yet been calculated. This can happen
|
|
|
|
* because the space maps have not been upgraded or the histogram feature
|
|
|
|
* is not enabled.
|
|
|
|
*/
|
|
|
|
#define ZFS_FRAG_INVALID UINT64_MAX
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* The location of the pool configuration repository, shared between kernel and
|
|
|
|
* userland.
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
#define ZPOOL_CACHE "/etc/zfs/zpool.cache"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* vdev states are ordered from least to most healthy.
|
|
|
|
* A vdev that's CANT_OPEN or below is considered unusable.
|
|
|
|
*/
|
|
|
|
typedef enum vdev_state {
|
|
|
|
VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
|
|
|
|
VDEV_STATE_CLOSED, /* Not currently open */
|
|
|
|
VDEV_STATE_OFFLINE, /* Not allowed to open */
|
|
|
|
VDEV_STATE_REMOVED, /* Explicitly removed from system */
|
|
|
|
VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
|
|
|
|
VDEV_STATE_FAULTED, /* External request to fault device */
|
|
|
|
VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
|
|
|
|
VDEV_STATE_HEALTHY /* Presumed good */
|
|
|
|
} vdev_state_t;
|
|
|
|
|
|
|
|
#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY
|
|
|
|
|
|
|
|
/*
|
|
|
|
* vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
|
|
|
|
* of the vdev stats structure uses these constants to distinguish why.
|
|
|
|
*/
|
|
|
|
typedef enum vdev_aux {
|
|
|
|
VDEV_AUX_NONE, /* no error */
|
|
|
|
VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
|
|
|
|
VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
|
|
|
|
VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
|
|
|
|
VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
|
|
|
|
VDEV_AUX_TOO_SMALL, /* vdev size is too small */
|
|
|
|
VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
|
|
|
|
VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
|
|
|
|
VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
|
2012-12-14 03:24:15 +04:00
|
|
|
VDEV_AUX_UNSUP_FEAT, /* unsupported features */
|
2008-11-20 23:01:55 +03:00
|
|
|
VDEV_AUX_SPARED, /* hot spare used in another pool */
|
2008-12-03 23:09:06 +03:00
|
|
|
VDEV_AUX_ERR_EXCEEDED, /* too many errors */
|
|
|
|
VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
|
2010-05-29 00:45:14 +04:00
|
|
|
VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
|
2017-05-19 22:30:16 +03:00
|
|
|
VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */
|
2017-03-29 03:21:11 +03:00
|
|
|
VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */
|
2017-05-19 22:30:16 +03:00
|
|
|
VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */
|
|
|
|
VDEV_AUX_ACTIVE, /* vdev active on a different host */
|
2008-11-20 23:01:55 +03:00
|
|
|
} vdev_aux_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pool state. The following states are written to disk as part of the normal
|
|
|
|
* SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining
|
|
|
|
* states are software abstractions used at various levels to communicate
|
|
|
|
* pool state.
|
|
|
|
*/
|
|
|
|
typedef enum pool_state {
|
|
|
|
POOL_STATE_ACTIVE = 0, /* In active use */
|
|
|
|
POOL_STATE_EXPORTED, /* Explicitly exported */
|
|
|
|
POOL_STATE_DESTROYED, /* Explicitly destroyed */
|
|
|
|
POOL_STATE_SPARE, /* Reserved for hot spare use */
|
|
|
|
POOL_STATE_L2CACHE, /* Level 2 ARC device */
|
|
|
|
POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
|
|
|
|
POOL_STATE_UNAVAIL, /* Internal libzfs state */
|
|
|
|
POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
|
|
|
|
} pool_state_t;
|
|
|
|
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
/*
|
|
|
|
* mmp state. The following states provide additional detail describing
|
|
|
|
* why a pool couldn't be safely imported.
|
|
|
|
*/
|
|
|
|
typedef enum mmp_state {
|
|
|
|
MMP_STATE_ACTIVE = 0, /* In active use */
|
|
|
|
MMP_STATE_INACTIVE, /* Inactive and safe to import */
|
|
|
|
MMP_STATE_NO_HOSTID /* System hostid is not set */
|
|
|
|
} mmp_state_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Scan Functions.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
typedef enum pool_scan_func {
|
|
|
|
POOL_SCAN_NONE,
|
|
|
|
POOL_SCAN_SCRUB,
|
|
|
|
POOL_SCAN_RESILVER,
|
|
|
|
POOL_SCAN_FUNCS
|
|
|
|
} pool_scan_func_t;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-07-07 08:16:13 +03:00
|
|
|
/*
|
|
|
|
* Used to control scrub pause and resume.
|
|
|
|
*/
|
|
|
|
typedef enum pool_scrub_cmd {
|
|
|
|
POOL_SCRUB_NORMAL = 0,
|
|
|
|
POOL_SCRUB_PAUSE,
|
|
|
|
POOL_SCRUB_FLAGS_END
|
|
|
|
} pool_scrub_cmd_t;
|
|
|
|
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ZIO types. Needed to interpret vdev statistics below.
|
|
|
|
*/
|
|
|
|
typedef enum zio_type {
|
|
|
|
ZIO_TYPE_NULL = 0,
|
|
|
|
ZIO_TYPE_READ,
|
|
|
|
ZIO_TYPE_WRITE,
|
|
|
|
ZIO_TYPE_FREE,
|
|
|
|
ZIO_TYPE_CLAIM,
|
|
|
|
ZIO_TYPE_IOCTL,
|
|
|
|
ZIO_TYPES
|
|
|
|
} zio_type_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Pool statistics. Note: all fields should be 64-bit because this
|
|
|
|
* is passed between kernel and userland as an nvlist uint64 array.
|
|
|
|
*/
|
|
|
|
typedef struct pool_scan_stat {
|
|
|
|
/* values stored on disk */
|
|
|
|
uint64_t pss_func; /* pool_scan_func_t */
|
|
|
|
uint64_t pss_state; /* dsl_scan_state_t */
|
|
|
|
uint64_t pss_start_time; /* scan start time */
|
|
|
|
uint64_t pss_end_time; /* scan end time */
|
|
|
|
uint64_t pss_to_examine; /* total bytes to scan */
|
|
|
|
uint64_t pss_examined; /* total examined bytes */
|
|
|
|
uint64_t pss_to_process; /* total bytes to process */
|
|
|
|
uint64_t pss_processed; /* total processed bytes */
|
|
|
|
uint64_t pss_errors; /* scan errors */
|
|
|
|
|
|
|
|
/* values not stored on disk */
|
|
|
|
uint64_t pss_pass_exam; /* examined bytes per scan pass */
|
|
|
|
uint64_t pss_pass_start; /* start time of a scan pass */
|
2017-07-07 08:16:13 +03:00
|
|
|
uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */
|
|
|
|
/* cumulative time scrub spent paused, needed for rate calculation */
|
|
|
|
uint64_t pss_pass_scrub_spent_paused;
|
2010-05-29 00:45:14 +04:00
|
|
|
} pool_scan_stat_t;
|
|
|
|
|
|
|
|
typedef enum dsl_scan_state {
|
|
|
|
DSS_NONE,
|
|
|
|
DSS_SCANNING,
|
|
|
|
DSS_FINISHED,
|
|
|
|
DSS_CANCELED,
|
|
|
|
DSS_NUM_STATES
|
|
|
|
} dsl_scan_state_t;
|
|
|
|
|
2014-02-21 07:57:17 +04:00
|
|
|
/*
|
|
|
|
* Errata described by http://zfsonlinux.org/msg/ZFS-8000-ER. The ordering
|
|
|
|
* of this enum must be maintained to ensure the errata identifiers map to
|
|
|
|
* the correct documentation. New errata may only be appended to the list
|
|
|
|
* and must contain corresponding documentation at the above link.
|
|
|
|
*/
|
|
|
|
typedef enum zpool_errata {
|
|
|
|
ZPOOL_ERRATA_NONE,
|
Add erratum for issue #2094
ZoL commit 1421c89 unintentionally changed the disk format in a forward-
compatible, but not backward compatible way. This was accomplished by
adding an entry to zbookmark_t, which is included in a couple of
on-disk structures. That lead to the creation of pools with incorrect
dsl_scan_phys_t objects that could only be imported by versions of ZoL
containing that commit. Such pools cannot be imported by other versions
of ZFS or past versions of ZoL.
The additional field has been removed by the previous commit. However,
affected pools must be imported and scrubbed using a version of ZoL with
this commit applied. This will return the pools to a state in which they
may be imported by other implementations.
The 'zpool import' or 'zpool status' command can be used to determine if
a pool is impacted. A message similar to one of the following means your
pool must be scrubbed to restore compatibility.
$ zpool import
pool: zol-0.6.2-173
id: 1165955789558693437
state: ONLINE
status: Errata #1 detected.
action: The pool can be imported using its name or numeric identifier,
however there is a compatibility issue which should be corrected
by running 'zpool scrub'
see: http://zfsonlinux.org/msg/ZFS-8000-ER
config:
...
$ zpool status
pool: zol-0.6.2-173
state: ONLINE
scan: pool compatibility issue detected.
see: https://github.com/zfsonlinux/zfs/issues/2094
action: To correct the issue run 'zpool scrub'.
config:
...
If there was an async destroy in progress 'zpool import' will prevent
the pool from being imported. Further advice on how to proceed will be
provided by the error message as follows.
$ zpool import
pool: zol-0.6.2-173
id: 1165955789558693437
state: ONLINE
status: Errata #2 detected.
action: The pool can not be imported with this version of ZFS due to an
active asynchronous destroy. Revert to an earlier version and
allow the destroy to complete before updating.
see: http://zfsonlinux.org/msg/ZFS-8000-ER
config:
...
Pools affected by the damaged dsl_scan_phys_t can be detected prior to
an upgrade by running the following command as root:
zdb -dddd poolname 1 | grep -P '^\t\tscan = ' | sed -e 's;scan = ;;' | wc -w
Note that `poolname` must be replaced with the name of the pool you wish
to check. A value of 25 indicates the dsl_scan_phys_t has been damaged.
A value of 24 indicates that the dsl_scan_phys_t is normal. A value of 0
indicates that there has never been a scrub run on the pool.
The regression caused by the change to zbookmark_t never made it into a
tagged release, Gentoo backports, Ubuntu, Debian, Fedora, or EPEL
stable respositorys. Only those using the HEAD version directly from
Github after the 0.6.2 but before the 0.6.3 tag are affected.
This patch does have one limitation that should be mentioned. It will not
detect errata #2 on a pool unless errata #1 is also present. It expected
this will not be a significant problem because pools impacted by errata #2
have a high probably of being impacted by errata #1.
End users can ensure they do no hit this unlikely case by waiting for all
asynchronous destroy operations to complete before updating ZoL. The
presence of any background destroys on any imported pools can be checked
by running `zpool get freeing` as root. This will display a non-zero
value for any pool with an active asynchronous destroy.
Lastly, it is expected that no user data has been lost as a result of
this erratum.
Original-patch-by: Tim Chase <tim@chase2k.com>
Reworked-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #2094
2014-02-21 08:28:33 +04:00
|
|
|
ZPOOL_ERRATA_ZOL_2094_SCRUB,
|
|
|
|
ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY,
|
2014-02-21 07:57:17 +04:00
|
|
|
} zpool_errata_t;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Vdev statistics. Note: all fields should be 64-bit because this
|
|
|
|
* is passed between kernel and userland as an nvlist uint64 array.
|
|
|
|
*/
|
|
|
|
typedef struct vdev_stat {
|
|
|
|
hrtime_t vs_timestamp; /* time since vdev load */
|
|
|
|
uint64_t vs_state; /* vdev state */
|
|
|
|
uint64_t vs_aux; /* see vdev_aux_t */
|
|
|
|
uint64_t vs_alloc; /* space allocated */
|
|
|
|
uint64_t vs_space; /* total capacity */
|
|
|
|
uint64_t vs_dspace; /* deflated capacity */
|
|
|
|
uint64_t vs_rsize; /* replaceable dev size */
|
2012-01-24 06:43:32 +04:00
|
|
|
uint64_t vs_esize; /* expandable dev size */
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t vs_ops[ZIO_TYPES]; /* operation count */
|
|
|
|
uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
|
|
|
|
uint64_t vs_read_errors; /* read errors */
|
|
|
|
uint64_t vs_write_errors; /* write errors */
|
|
|
|
uint64_t vs_checksum_errors; /* checksum errors */
|
|
|
|
uint64_t vs_self_healed; /* self-healed bytes */
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t vs_scan_removing; /* removing? */
|
|
|
|
uint64_t vs_scan_processed; /* scan processed bytes */
|
2014-07-20 00:19:24 +04:00
|
|
|
uint64_t vs_fragmentation; /* device fragmentation */
|
2016-02-29 21:05:23 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
} vdev_stat_t;
|
|
|
|
|
2016-02-29 21:05:23 +03:00
|
|
|
/*
|
|
|
|
* Extended stats
|
|
|
|
*
|
|
|
|
* These are stats which aren't included in the original iostat output. For
|
|
|
|
* convenience, they are grouped together in vdev_stat_ex, although each stat
|
2016-08-08 11:00:08 +03:00
|
|
|
* is individually exported as an nvlist.
|
2016-02-29 21:05:23 +03:00
|
|
|
*/
|
|
|
|
typedef struct vdev_stat_ex {
|
|
|
|
/* Number of ZIOs issued to disk and waiting to finish */
|
|
|
|
uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE];
|
|
|
|
|
|
|
|
/* Number of ZIOs pending to be issued to disk */
|
|
|
|
uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Below are the histograms for various latencies. Buckets are in
|
|
|
|
* units of nanoseconds.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 2^37 nanoseconds = 134s. Timeouts will probably start kicking in
|
|
|
|
* before this.
|
|
|
|
*/
|
2016-05-26 00:21:35 +03:00
|
|
|
#define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */
|
|
|
|
#define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */
|
|
|
|
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
/* Amount of time in ZIO queue (ns) */
|
|
|
|
uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
|
2016-05-26 00:21:35 +03:00
|
|
|
[VDEV_L_HISTO_BUCKETS];
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
/* Total ZIO latency (ns). Includes queuing and disk access time */
|
2016-05-26 00:21:35 +03:00
|
|
|
uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS];
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
/* Amount of time to read/write the disk (ns) */
|
2016-05-26 00:21:35 +03:00
|
|
|
uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS];
|
|
|
|
|
|
|
|
/* "lookup the bucket for a value" histogram macros */
|
|
|
|
#define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \
|
|
|
|
buckets - 1) : 0)
|
|
|
|
#define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS)
|
|
|
|
#define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS)
|
|
|
|
|
|
|
|
/* Physical IO histogram */
|
|
|
|
uint64_t vsx_ind_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
|
|
|
|
[VDEV_RQ_HISTO_BUCKETS];
|
2016-02-29 21:05:23 +03:00
|
|
|
|
2016-05-26 00:21:35 +03:00
|
|
|
/* Delegated (aggregated) physical IO histogram */
|
|
|
|
uint64_t vsx_agg_histo[ZIO_PRIORITY_NUM_QUEUEABLE]
|
|
|
|
[VDEV_RQ_HISTO_BUCKETS];
|
2016-02-29 21:05:23 +03:00
|
|
|
|
|
|
|
} vdev_stat_ex_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* DDT statistics. Note: all fields should be 64-bit because this
|
|
|
|
* is passed between kernel and userland as an nvlist uint64 array.
|
|
|
|
*/
|
|
|
|
typedef struct ddt_object {
|
2017-01-03 20:31:18 +03:00
|
|
|
uint64_t ddo_count; /* number of elements in ddt */
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t ddo_dspace; /* size of ddt on disk */
|
|
|
|
uint64_t ddo_mspace; /* size of ddt in-core */
|
|
|
|
} ddt_object_t;
|
|
|
|
|
|
|
|
typedef struct ddt_stat {
|
|
|
|
uint64_t dds_blocks; /* blocks */
|
|
|
|
uint64_t dds_lsize; /* logical size */
|
|
|
|
uint64_t dds_psize; /* physical size */
|
|
|
|
uint64_t dds_dsize; /* deflated allocated size */
|
|
|
|
uint64_t dds_ref_blocks; /* referenced blocks */
|
|
|
|
uint64_t dds_ref_lsize; /* referenced lsize * refcnt */
|
|
|
|
uint64_t dds_ref_psize; /* referenced psize * refcnt */
|
|
|
|
uint64_t dds_ref_dsize; /* referenced dsize * refcnt */
|
|
|
|
} ddt_stat_t;
|
|
|
|
|
|
|
|
typedef struct ddt_histogram {
|
|
|
|
ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */
|
|
|
|
} ddt_histogram_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZVOL_DRIVER "zvol"
|
|
|
|
#define ZFS_DRIVER "zfs"
|
|
|
|
#define ZFS_DEV "/dev/zfs"
|
2016-11-01 02:56:10 +03:00
|
|
|
#define ZFS_SHARETAB "/etc/dfs/sharetab"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-01-26 23:28:29 +03:00
|
|
|
#define ZFS_SUPER_MAGIC 0x2fc12fc1
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/* general zvol path */
|
2010-08-26 22:45:02 +04:00
|
|
|
#define ZVOL_DIR "/dev"
|
|
|
|
|
|
|
|
#define ZVOL_MAJOR 230
|
|
|
|
#define ZVOL_MINOR_BITS 4
|
|
|
|
#define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1)
|
|
|
|
#define ZVOL_MINORS (1 << 4)
|
2011-02-22 13:58:44 +03:00
|
|
|
#define ZVOL_DEV_NAME "zd"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
#define ZVOL_PROP_NAME "name"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define ZVOL_DEFAULT_BLOCKSIZE 8192
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* /dev/zfs ioctl numbers.
|
|
|
|
*/
|
|
|
|
typedef enum zfs_ioc {
|
2013-12-14 02:49:33 +04:00
|
|
|
/*
|
2016-06-10 03:04:12 +03:00
|
|
|
* Illumos - 71/128 numbers reserved.
|
2013-12-14 02:49:33 +04:00
|
|
|
*/
|
2013-08-28 15:45:09 +04:00
|
|
|
ZFS_IOC_FIRST = ('Z' << 8),
|
|
|
|
ZFS_IOC = ZFS_IOC_FIRST,
|
|
|
|
ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_IOC_POOL_DESTROY,
|
|
|
|
ZFS_IOC_POOL_IMPORT,
|
|
|
|
ZFS_IOC_POOL_EXPORT,
|
|
|
|
ZFS_IOC_POOL_CONFIGS,
|
|
|
|
ZFS_IOC_POOL_STATS,
|
|
|
|
ZFS_IOC_POOL_TRYIMPORT,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZFS_IOC_POOL_SCAN,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_IOC_POOL_FREEZE,
|
|
|
|
ZFS_IOC_POOL_UPGRADE,
|
|
|
|
ZFS_IOC_POOL_GET_HISTORY,
|
|
|
|
ZFS_IOC_VDEV_ADD,
|
|
|
|
ZFS_IOC_VDEV_REMOVE,
|
|
|
|
ZFS_IOC_VDEV_SET_STATE,
|
|
|
|
ZFS_IOC_VDEV_ATTACH,
|
|
|
|
ZFS_IOC_VDEV_DETACH,
|
|
|
|
ZFS_IOC_VDEV_SETPATH,
|
2009-07-03 02:44:48 +04:00
|
|
|
ZFS_IOC_VDEV_SETFRU,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_IOC_OBJSET_STATS,
|
|
|
|
ZFS_IOC_OBJSET_ZPLPROPS,
|
|
|
|
ZFS_IOC_DATASET_LIST_NEXT,
|
|
|
|
ZFS_IOC_SNAPSHOT_LIST_NEXT,
|
|
|
|
ZFS_IOC_SET_PROP,
|
|
|
|
ZFS_IOC_CREATE,
|
|
|
|
ZFS_IOC_DESTROY,
|
|
|
|
ZFS_IOC_ROLLBACK,
|
|
|
|
ZFS_IOC_RENAME,
|
|
|
|
ZFS_IOC_RECV,
|
|
|
|
ZFS_IOC_SEND,
|
|
|
|
ZFS_IOC_INJECT_FAULT,
|
|
|
|
ZFS_IOC_CLEAR_FAULT,
|
|
|
|
ZFS_IOC_INJECT_LIST_NEXT,
|
|
|
|
ZFS_IOC_ERROR_LOG,
|
|
|
|
ZFS_IOC_CLEAR,
|
|
|
|
ZFS_IOC_PROMOTE,
|
|
|
|
ZFS_IOC_SNAPSHOT,
|
|
|
|
ZFS_IOC_DSOBJ_TO_DSNAME,
|
|
|
|
ZFS_IOC_OBJ_TO_PATH,
|
|
|
|
ZFS_IOC_POOL_SET_PROPS,
|
|
|
|
ZFS_IOC_POOL_GET_PROPS,
|
|
|
|
ZFS_IOC_SET_FSACL,
|
|
|
|
ZFS_IOC_GET_FSACL,
|
|
|
|
ZFS_IOC_SHARE,
|
2009-07-03 02:44:48 +04:00
|
|
|
ZFS_IOC_INHERIT_PROP,
|
|
|
|
ZFS_IOC_SMB_ACL,
|
|
|
|
ZFS_IOC_USERSPACE_ONE,
|
|
|
|
ZFS_IOC_USERSPACE_MANY,
|
2009-08-18 22:43:27 +04:00
|
|
|
ZFS_IOC_USERSPACE_UPGRADE,
|
|
|
|
ZFS_IOC_HOLD,
|
|
|
|
ZFS_IOC_RELEASE,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZFS_IOC_GET_HOLDS,
|
|
|
|
ZFS_IOC_OBJSET_RECVD_PROPS,
|
2010-08-27 01:24:34 +04:00
|
|
|
ZFS_IOC_VDEV_SPLIT,
|
|
|
|
ZFS_IOC_NEXT_OBJ,
|
|
|
|
ZFS_IOC_DIFF,
|
|
|
|
ZFS_IOC_TMP_SNAPSHOT,
|
2010-08-26 22:42:43 +04:00
|
|
|
ZFS_IOC_OBJ_TO_STATS,
|
2011-11-17 22:14:36 +04:00
|
|
|
ZFS_IOC_SPACE_WRITTEN,
|
|
|
|
ZFS_IOC_SPACE_SNAPS,
|
2013-12-14 02:49:33 +04:00
|
|
|
ZFS_IOC_DESTROY_SNAPS,
|
|
|
|
ZFS_IOC_POOL_REGUID,
|
2012-01-24 06:43:32 +04:00
|
|
|
ZFS_IOC_POOL_REOPEN,
|
2012-05-10 02:05:14 +04:00
|
|
|
ZFS_IOC_SEND_PROGRESS,
|
2013-08-28 15:45:09 +04:00
|
|
|
ZFS_IOC_LOG_HISTORY,
|
|
|
|
ZFS_IOC_SEND_NEW,
|
|
|
|
ZFS_IOC_SEND_SPACE,
|
|
|
|
ZFS_IOC_CLONE,
|
2013-12-12 02:33:41 +04:00
|
|
|
ZFS_IOC_BOOKMARK,
|
|
|
|
ZFS_IOC_GET_BOOKMARKS,
|
|
|
|
ZFS_IOC_DESTROY_BOOKMARKS,
|
2016-06-10 03:04:12 +03:00
|
|
|
ZFS_IOC_RECV_NEW,
|
2017-05-19 22:33:11 +03:00
|
|
|
ZFS_IOC_POOL_SYNC,
|
2013-12-14 02:49:33 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Linux - 3/64 numbers reserved.
|
|
|
|
*/
|
|
|
|
ZFS_IOC_LINUX = ('Z' << 8) + 0x80,
|
|
|
|
ZFS_IOC_EVENTS_NEXT,
|
|
|
|
ZFS_IOC_EVENTS_CLEAR,
|
2013-11-23 02:52:16 +04:00
|
|
|
ZFS_IOC_EVENTS_SEEK,
|
2013-12-14 02:49:33 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* FreeBSD - 1/64 numbers reserved.
|
|
|
|
*/
|
|
|
|
ZFS_IOC_FREEBSD = ('Z' << 8) + 0xC0,
|
|
|
|
|
2013-08-28 15:45:09 +04:00
|
|
|
ZFS_IOC_LAST
|
2008-11-20 23:01:55 +03:00
|
|
|
} zfs_ioc_t;
|
|
|
|
|
2011-02-22 13:58:44 +03:00
|
|
|
/*
|
|
|
|
* zvol ioctl to get dataset name
|
|
|
|
*/
|
2016-06-16 00:28:36 +03:00
|
|
|
#define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN])
|
2011-02-22 13:58:44 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Internal SPA load state. Used by FMA diagnosis engine.
|
|
|
|
*/
|
|
|
|
typedef enum {
|
2010-05-29 00:45:14 +04:00
|
|
|
SPA_LOAD_NONE, /* no load in progress */
|
|
|
|
SPA_LOAD_OPEN, /* normal open */
|
|
|
|
SPA_LOAD_IMPORT, /* import in progress */
|
|
|
|
SPA_LOAD_TRYIMPORT, /* tryimport in progress */
|
|
|
|
SPA_LOAD_RECOVER, /* recovery requested */
|
2016-10-14 03:59:18 +03:00
|
|
|
SPA_LOAD_ERROR, /* load failed */
|
|
|
|
SPA_LOAD_CREATE /* creation in progress */
|
2008-11-20 23:01:55 +03:00
|
|
|
} spa_load_state_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Bookmark name values.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_ERR_LIST "error list"
|
|
|
|
#define ZPOOL_ERR_DATASET "dataset"
|
|
|
|
#define ZPOOL_ERR_OBJECT "object"
|
|
|
|
|
|
|
|
#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The following are names used in the nvlist describing
|
|
|
|
* the pool's history log.
|
|
|
|
*/
|
|
|
|
#define ZPOOL_HIST_RECORD "history record"
|
|
|
|
#define ZPOOL_HIST_TIME "history time"
|
|
|
|
#define ZPOOL_HIST_CMD "history command"
|
|
|
|
#define ZPOOL_HIST_WHO "history who"
|
|
|
|
#define ZPOOL_HIST_ZONE "history zone"
|
|
|
|
#define ZPOOL_HIST_HOST "history hostname"
|
|
|
|
#define ZPOOL_HIST_TXG "history txg"
|
|
|
|
#define ZPOOL_HIST_INT_EVENT "history internal event"
|
|
|
|
#define ZPOOL_HIST_INT_STR "history internal str"
|
2013-08-28 15:45:09 +04:00
|
|
|
#define ZPOOL_HIST_INT_NAME "internal_name"
|
|
|
|
#define ZPOOL_HIST_IOCTL "ioctl"
|
|
|
|
#define ZPOOL_HIST_INPUT_NVL "in_nvl"
|
|
|
|
#define ZPOOL_HIST_OUTPUT_NVL "out_nvl"
|
|
|
|
#define ZPOOL_HIST_DSNAME "dsname"
|
|
|
|
#define ZPOOL_HIST_DSID "dsid"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Flags for ZFS_IOC_VDEV_SET_STATE
|
|
|
|
*/
|
|
|
|
#define ZFS_ONLINE_CHECKREMOVE 0x1
|
|
|
|
#define ZFS_ONLINE_UNSPARE 0x2
|
|
|
|
#define ZFS_ONLINE_FORCEFAULT 0x4
|
2009-07-03 02:44:48 +04:00
|
|
|
#define ZFS_ONLINE_EXPAND 0x8
|
2008-11-20 23:01:55 +03:00
|
|
|
#define ZFS_OFFLINE_TEMPORARY 0x1
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
|
|
|
|
* Flags for ZFS_IOC_POOL_IMPORT
|
|
|
|
*/
|
|
|
|
#define ZFS_IMPORT_NORMAL 0x0
|
|
|
|
#define ZFS_IMPORT_VERBATIM 0x1
|
|
|
|
#define ZFS_IMPORT_ANY_HOST 0x2
|
|
|
|
#define ZFS_IMPORT_MISSING_LOG 0x4
|
|
|
|
#define ZFS_IMPORT_ONLY 0x8
|
2013-07-01 18:57:04 +04:00
|
|
|
#define ZFS_IMPORT_TEMP_NAME 0x10
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
#define ZFS_IMPORT_SKIP_MMP 0x20
|
2010-08-27 01:24:34 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Sysevent payload members. ZFS will generate the following sysevents with the
|
|
|
|
* given payloads:
|
|
|
|
*
|
|
|
|
* ESC_ZFS_RESILVER_START
|
|
|
|
* ESC_ZFS_RESILVER_END
|
|
|
|
* ESC_ZFS_POOL_DESTROY
|
2011-11-12 02:07:54 +04:00
|
|
|
* ESC_ZFS_POOL_REGUID
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* ZFS_EV_POOL_NAME DATA_TYPE_STRING
|
|
|
|
* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
|
|
|
|
*
|
|
|
|
* ESC_ZFS_VDEV_REMOVE
|
|
|
|
* ESC_ZFS_VDEV_CLEAR
|
|
|
|
* ESC_ZFS_VDEV_CHECK
|
|
|
|
*
|
|
|
|
* ZFS_EV_POOL_NAME DATA_TYPE_STRING
|
|
|
|
* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
|
|
|
|
* ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional)
|
|
|
|
* ZFS_EV_VDEV_GUID DATA_TYPE_UINT64
|
2017-05-30 21:39:17 +03:00
|
|
|
*
|
|
|
|
* ESC_ZFS_HISTORY_EVENT
|
|
|
|
*
|
|
|
|
* ZFS_EV_POOL_NAME DATA_TYPE_STRING
|
|
|
|
* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
|
|
|
|
* ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional)
|
|
|
|
* ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional)
|
|
|
|
* ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional)
|
|
|
|
* ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional)
|
|
|
|
* ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional)
|
|
|
|
* ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional)
|
|
|
|
* ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional)
|
|
|
|
* ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional)
|
|
|
|
* ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional)
|
|
|
|
* ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional)
|
|
|
|
* ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional)
|
|
|
|
* ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional)
|
|
|
|
*
|
|
|
|
* The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the
|
|
|
|
* history log nvlist. The keynames will be free of any spaces or other
|
|
|
|
* characters that could be potentially unexpected to consumers of the
|
|
|
|
* sysevents.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
#define ZFS_EV_POOL_NAME "pool_name"
|
|
|
|
#define ZFS_EV_POOL_GUID "pool_guid"
|
|
|
|
#define ZFS_EV_VDEV_PATH "vdev_path"
|
|
|
|
#define ZFS_EV_VDEV_GUID "vdev_guid"
|
2017-05-30 21:39:17 +03:00
|
|
|
#define ZFS_EV_HIST_TIME "history_time"
|
|
|
|
#define ZFS_EV_HIST_CMD "history_command"
|
|
|
|
#define ZFS_EV_HIST_WHO "history_who"
|
|
|
|
#define ZFS_EV_HIST_ZONE "history_zone"
|
|
|
|
#define ZFS_EV_HIST_HOST "history_hostname"
|
|
|
|
#define ZFS_EV_HIST_TXG "history_txg"
|
|
|
|
#define ZFS_EV_HIST_INT_EVENT "history_internal_event"
|
|
|
|
#define ZFS_EV_HIST_INT_STR "history_internal_str"
|
|
|
|
#define ZFS_EV_HIST_INT_NAME "history_internal_name"
|
|
|
|
#define ZFS_EV_HIST_IOCTL "history_ioctl"
|
|
|
|
#define ZFS_EV_HIST_DSNAME "history_dsname"
|
|
|
|
#define ZFS_EV_HIST_DSID "history_dsid"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _SYS_FS_ZFS_H */
|