2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
2022-07-12 00:16:13 +03:00
|
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
2008-11-20 23:01:55 +03:00
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2020-04-23 20:06:57 +03:00
|
|
|
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
|
2013-10-07 14:53:58 +04:00
|
|
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
2012-05-10 02:05:14 +04:00
|
|
|
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
2014-09-12 07:28:35 +04:00
|
|
|
* Copyright 2014 HybridCluster. All rights reserved.
|
2015-04-02 06:44:32 +03:00
|
|
|
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
2016-06-16 01:47:05 +03:00
|
|
|
* Copyright 2013 Saso Kiselkov. All rights reserved.
|
2018-09-06 04:33:36 +03:00
|
|
|
* Copyright (c) 2017, Intel Corporation.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/* Portions Copyright 2010 Robert Milkowski */
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#ifndef _SYS_DMU_H
|
|
|
|
#define _SYS_DMU_H
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This file describes the interface that the DMU provides for its
|
|
|
|
* consumers.
|
|
|
|
*
|
|
|
|
* The DMU also interacts with the SPA. That interface is described in
|
|
|
|
* dmu_spa.h.
|
|
|
|
*/
|
|
|
|
|
2015-04-02 06:44:32 +03:00
|
|
|
#include <sys/zfs_context.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/inttypes.h>
|
|
|
|
#include <sys/cred.h>
|
2013-08-28 15:45:09 +04:00
|
|
|
#include <sys/fs/zfs.h>
|
2016-07-11 20:45:52 +03:00
|
|
|
#include <sys/zio_compress.h>
|
2015-12-22 04:31:57 +03:00
|
|
|
#include <sys/zio_priority.h>
|
2010-08-26 21:26:05 +04:00
|
|
|
#include <sys/uio.h>
|
2019-11-21 20:32:57 +03:00
|
|
|
#include <sys/zfs_file.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
struct page;
|
|
|
|
struct vnode;
|
|
|
|
struct spa;
|
|
|
|
struct zilog;
|
|
|
|
struct zio;
|
|
|
|
struct blkptr;
|
|
|
|
struct zap_cursor;
|
|
|
|
struct dsl_dataset;
|
|
|
|
struct dsl_pool;
|
|
|
|
struct dnode;
|
|
|
|
struct drr_begin;
|
|
|
|
struct drr_end;
|
2014-06-25 22:37:59 +04:00
|
|
|
struct zbookmark_phys;
|
2008-11-20 23:01:55 +03:00
|
|
|
struct spa;
|
|
|
|
struct nvlist;
|
2009-07-03 02:44:48 +04:00
|
|
|
struct arc_buf;
|
2010-05-29 00:45:14 +04:00
|
|
|
struct zio_prop;
|
|
|
|
struct sa_handle;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
struct dsl_crypto_params;
|
2018-10-02 01:13:12 +03:00
|
|
|
struct locked_range;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
typedef struct objset objset_t;
|
|
|
|
typedef struct dmu_tx dmu_tx_t;
|
|
|
|
typedef struct dsl_dir dsl_dir_t;
|
OpenZFS 7004 - dmu_tx_hold_zap() does dnode_hold() 7x on same object
Using a benchmark which has 32 threads creating 2 million files in the
same directory, on a machine with 16 CPU cores, I observed poor
performance. I noticed that dmu_tx_hold_zap() was using about 30% of
all CPU, and doing dnode_hold() 7 times on the same object (the ZAP
object that is being held).
dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is
running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the
dnode_t that we already have in hand, rather than repeatedly calling
dnode_hold(). To do this, we need to pass the dnode_t down through
all the intermediate calls that dmu_tx_hold_zap() makes, making these
routines take the dnode_t* rather than an objset_t* and a uint64_t
object number. In particular, the following routines will need to have
analogous *_by_dnode() variants created:
dmu_buf_hold_noread()
dmu_buf_hold()
zap_lookup()
zap_lookup_norm()
zap_count_write()
zap_lockdir()
zap_count_write()
This can improve performance on the benchmark described above by 100%,
from 30,000 file creations per second to 60,000. (This improvement is on
top of that provided by working around the object allocation issue. Peak
performance of ~90,000 creations per second was observed with 8 CPUs;
adding CPUs past that decreased performance due to lock contention.) The
CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds
to 40 CPU-seconds.
Sponsored by: Intel Corp.
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7004
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/109
Closes #4641
Closes #4972
2016-07-21 01:42:13 +03:00
|
|
|
typedef struct dnode dnode_t;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
typedef enum dmu_object_byteswap {
|
|
|
|
DMU_BSWAP_UINT8,
|
|
|
|
DMU_BSWAP_UINT16,
|
|
|
|
DMU_BSWAP_UINT32,
|
|
|
|
DMU_BSWAP_UINT64,
|
|
|
|
DMU_BSWAP_ZAP,
|
|
|
|
DMU_BSWAP_DNODE,
|
|
|
|
DMU_BSWAP_OBJSET,
|
|
|
|
DMU_BSWAP_ZNODE,
|
|
|
|
DMU_BSWAP_OLDACL,
|
|
|
|
DMU_BSWAP_ACL,
|
|
|
|
/*
|
|
|
|
* Allocating a new byteswap type number makes the on-disk format
|
|
|
|
* incompatible with any other format that uses the same number.
|
|
|
|
*
|
|
|
|
* Data can usually be structured to work with one of the
|
|
|
|
* DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
|
|
|
|
*/
|
|
|
|
DMU_BSWAP_NUMFUNCS
|
|
|
|
} dmu_object_byteswap_t;
|
|
|
|
|
|
|
|
/* Flag bit that DMU_OT() sets unconditionally in every value it builds. */
#define DMU_OT_NEWTYPE 0x80
|
|
|
|
/* Flag bit that DMU_OT() sets when its metadata argument is nonzero. */
#define DMU_OT_METADATA 0x40
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
/* Flag bit that DMU_OT() sets when its encrypted argument is nonzero. */
#define DMU_OT_ENCRYPTED 0x20
|
|
|
|
/* Low five bits of a DMU_OT() value: the byteswap argument, masked. */
#define DMU_OT_BYTESWAP_MASK 0x1f
|
2012-12-14 03:24:15 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Defines a uint8_t object type. Object types specify if the data
|
|
|
|
* in the object is metadata (boolean) and how to byteswap the data
|
2018-07-10 20:49:50 +03:00
|
|
|
* (dmu_object_byteswap_t). All of the types created by this method
|
|
|
|
* are cached in the dbuf metadata cache.
|
2012-12-14 03:24:15 +04:00
|
|
|
*/
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
#define DMU_OT(byteswap, metadata, encrypted) \
|
2012-12-14 03:24:15 +04:00
|
|
|
(DMU_OT_NEWTYPE | \
|
|
|
|
((metadata) ? DMU_OT_METADATA : 0) | \
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
((encrypted) ? DMU_OT_ENCRYPTED : 0) | \
|
2012-12-14 03:24:15 +04:00
|
|
|
((byteswap) & DMU_OT_BYTESWAP_MASK))
|
|
|
|
|
|
|
|
/*
 * Nonzero when ot is a valid object type.  A value with DMU_OT_NEWTYPE set
 * is valid if its byteswap field is below DMU_BSWAP_NUMFUNCS; any other
 * value is valid if it is below DMU_OT_NUMTYPES.
 */
#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
|
|
|
|
((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
|
|
|
|
(ot) < DMU_OT_NUMTYPES)
|
|
|
|
|
2018-07-10 20:49:50 +03:00
|
|
|
/*
 * Always B_TRUE for a value with DMU_OT_NEWTYPE set; for any other value,
 * the ot_dbuf_metadata_cache field of that type's dmu_ot entry.
 */
#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
|
|
|
|
B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
|
|
|
|
|
2018-04-06 23:30:26 +03:00
|
|
|
/*
|
|
|
|
* MDB doesn't have dmu_ot; it defines these macros itself.
|
|
|
|
*/
|
|
|
|
#ifndef ZFS_MDB
|
|
|
|
#define DMU_OT_IS_METADATA_IMPL(ot) (dmu_ot[ot].ot_metadata)
|
|
|
|
#define DMU_OT_IS_ENCRYPTED_IMPL(ot) (dmu_ot[ot].ot_encrypt)
|
|
|
|
#define DMU_OT_BYTESWAP_IMPL(ot) (dmu_ot[ot].ot_byteswap)
|
|
|
|
#endif
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
/*
 * Metadata test that works for both encodings.  A value with DMU_OT_NEWTYPE
 * set answers from its DMU_OT_METADATA bit, so the result is 0 or
 * DMU_OT_METADATA rather than 0 or 1; any other value defers to
 * DMU_OT_IS_METADATA_IMPL().
 */
#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
|
|
|
|
((ot) & DMU_OT_METADATA) : \
|
2018-04-06 23:30:26 +03:00
|
|
|
DMU_OT_IS_METADATA_IMPL(ot))
|
2012-12-14 03:24:15 +04:00
|
|
|
|
2018-09-06 04:33:36 +03:00
|
|
|
#define DMU_OT_IS_DDT(ot) \
|
|
|
|
((ot) == DMU_OT_DDT_ZAP)
|
|
|
|
|
|
|
|
/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */
|
|
|
|
#define DMU_OT_IS_FILE(ot) \
|
|
|
|
((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
/*
 * Encryption test that works for both encodings.  A value with
 * DMU_OT_NEWTYPE set answers from its DMU_OT_ENCRYPTED bit, so the result
 * is 0 or DMU_OT_ENCRYPTED rather than 0 or 1; any other value defers to
 * DMU_OT_IS_ENCRYPTED_IMPL().
 */
#define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
|
|
|
|
((ot) & DMU_OT_ENCRYPTED) : \
|
2018-04-06 23:30:26 +03:00
|
|
|
DMU_OT_IS_ENCRYPTED_IMPL(ot))
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
/*
|
|
|
|
* These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
|
|
|
|
* have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
|
|
|
|
* is repurposed for embedded BPs.
|
|
|
|
*/
|
|
|
|
#define DMU_OT_HAS_FILL(ot) \
|
|
|
|
((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
/*
 * Byteswap selector that works for both encodings: the bits under
 * DMU_OT_BYTESWAP_MASK for a value with DMU_OT_NEWTYPE set, otherwise
 * whatever DMU_OT_BYTESWAP_IMPL() yields.
 */
#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
|
|
|
|
((ot) & DMU_OT_BYTESWAP_MASK) : \
|
2018-04-06 23:30:26 +03:00
|
|
|
DMU_OT_BYTESWAP_IMPL(ot))
|
2012-12-14 03:24:15 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
typedef enum dmu_object_type {
|
|
|
|
DMU_OT_NONE,
|
|
|
|
/* general: */
|
|
|
|
DMU_OT_OBJECT_DIRECTORY, /* ZAP */
|
|
|
|
DMU_OT_OBJECT_ARRAY, /* UINT64 */
|
|
|
|
DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
|
|
|
|
DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
|
2010-05-29 00:45:14 +04:00
|
|
|
DMU_OT_BPOBJ, /* UINT64 */
|
|
|
|
DMU_OT_BPOBJ_HDR, /* UINT64 */
|
2008-11-20 23:01:55 +03:00
|
|
|
/* spa: */
|
|
|
|
DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
|
|
|
|
DMU_OT_SPACE_MAP, /* UINT64 */
|
|
|
|
/* zil: */
|
|
|
|
DMU_OT_INTENT_LOG, /* UINT64 */
|
|
|
|
/* dmu: */
|
|
|
|
DMU_OT_DNODE, /* DNODE */
|
|
|
|
DMU_OT_OBJSET, /* OBJSET */
|
|
|
|
/* dsl: */
|
|
|
|
DMU_OT_DSL_DIR, /* UINT64 */
|
|
|
|
DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
|
|
|
|
DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
|
|
|
|
DMU_OT_DSL_PROPS, /* ZAP */
|
|
|
|
DMU_OT_DSL_DATASET, /* UINT64 */
|
|
|
|
/* zpl: */
|
|
|
|
DMU_OT_ZNODE, /* ZNODE */
|
|
|
|
DMU_OT_OLDACL, /* Old ACL */
|
|
|
|
DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
|
|
|
|
DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
|
|
|
|
DMU_OT_MASTER_NODE, /* ZAP */
|
|
|
|
DMU_OT_UNLINKED_SET, /* ZAP */
|
|
|
|
/* zvol: */
|
|
|
|
DMU_OT_ZVOL, /* UINT8 */
|
|
|
|
DMU_OT_ZVOL_PROP, /* ZAP */
|
|
|
|
/* other; for testing only! */
|
|
|
|
DMU_OT_PLAIN_OTHER, /* UINT8 */
|
|
|
|
DMU_OT_UINT64_OTHER, /* UINT64 */
|
|
|
|
DMU_OT_ZAP_OTHER, /* ZAP */
|
|
|
|
/* new object types: */
|
|
|
|
DMU_OT_ERROR_LOG, /* ZAP */
|
|
|
|
DMU_OT_SPA_HISTORY, /* UINT8 */
|
|
|
|
DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
|
|
|
|
DMU_OT_POOL_PROPS, /* ZAP */
|
|
|
|
DMU_OT_DSL_PERMS, /* ZAP */
|
|
|
|
DMU_OT_ACL, /* ACL */
|
|
|
|
DMU_OT_SYSACL, /* SYSACL */
|
|
|
|
DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
|
|
|
|
DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
|
2008-12-03 23:09:06 +03:00
|
|
|
DMU_OT_NEXT_CLONES, /* ZAP */
|
2010-05-29 00:45:14 +04:00
|
|
|
DMU_OT_SCAN_QUEUE, /* ZAP */
|
2009-07-03 02:44:48 +04:00
|
|
|
DMU_OT_USERGROUP_USED, /* ZAP */
|
|
|
|
DMU_OT_USERGROUP_QUOTA, /* ZAP */
|
2009-08-18 22:43:27 +04:00
|
|
|
DMU_OT_USERREFS, /* ZAP */
|
2010-05-29 00:45:14 +04:00
|
|
|
DMU_OT_DDT_ZAP, /* ZAP */
|
|
|
|
DMU_OT_DDT_STATS, /* ZAP */
|
|
|
|
DMU_OT_SA, /* System attr */
|
|
|
|
DMU_OT_SA_MASTER_NODE, /* ZAP */
|
|
|
|
DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
|
|
|
|
DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
|
|
|
|
DMU_OT_SCAN_XLATE, /* ZAP */
|
|
|
|
DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
|
|
|
|
DMU_OT_DEADLIST, /* ZAP */
|
|
|
|
DMU_OT_DEADLIST_HDR, /* UINT64 */
|
|
|
|
DMU_OT_DSL_CLONES, /* ZAP */
|
|
|
|
DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
|
2012-12-14 03:24:15 +04:00
|
|
|
/*
|
|
|
|
* Do not allocate new object types here. Doing so makes the on-disk
|
|
|
|
* format incompatible with any other format that uses the same object
|
|
|
|
* type number.
|
|
|
|
*
|
|
|
|
* When creating an object which does not have one of the above types
|
|
|
|
* use the DMU_OTN_* type with the correct byteswap and metadata
|
|
|
|
* values.
|
|
|
|
*
|
|
|
|
* The DMU_OTN_* types do not have entries in the dmu_ot table,
|
2018-09-06 04:33:36 +03:00
|
|
|
* use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
|
2012-12-14 03:24:15 +04:00
|
|
|
* of indexing into dmu_ot directly (this works for both DMU_OT_* types
|
|
|
|
* and DMU_OTN_* types).
|
|
|
|
*/
|
|
|
|
DMU_OT_NUMTYPES,
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Names for valid types declared with DMU_OT().
|
|
|
|
*/
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE),
|
|
|
|
DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE),
|
|
|
|
DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE),
|
|
|
|
DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE),
|
|
|
|
DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE),
|
|
|
|
DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE),
|
|
|
|
DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE),
|
|
|
|
DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE),
|
|
|
|
DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE),
|
|
|
|
DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE),
|
|
|
|
|
|
|
|
DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE),
|
|
|
|
DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE),
|
|
|
|
DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE),
|
|
|
|
DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE),
|
|
|
|
DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE),
|
|
|
|
DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE),
|
|
|
|
DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE),
|
|
|
|
DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE),
|
|
|
|
DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE),
|
|
|
|
DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE),
|
2008-11-20 23:01:55 +03:00
|
|
|
} dmu_object_type_t;
|
|
|
|
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics were
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issue`, leading to the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issue` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
/*
|
|
|
|
* These flags are intended to be used to specify the "txg_how"
|
|
|
|
* parameter when calling the dmu_tx_assign() function. See the comment
|
|
|
|
* above dmu_tx_assign() for more details on the meaning of these flags.
|
|
|
|
*/
|
|
|
|
#define TXG_NOWAIT (0ULL)
|
|
|
|
#define TXG_WAIT (1ULL<<0)
|
|
|
|
#define TXG_NOTHROTTLE (1ULL<<1)
|
2013-09-04 16:00:57 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
void byteswap_uint64_array(void *buf, size_t size);
|
|
|
|
void byteswap_uint32_array(void *buf, size_t size);
|
|
|
|
void byteswap_uint16_array(void *buf, size_t size);
|
|
|
|
void byteswap_uint8_array(void *buf, size_t size);
|
|
|
|
void zap_byteswap(void *buf, size_t size);
|
|
|
|
void zfs_oldacl_byteswap(void *buf, size_t size);
|
|
|
|
void zfs_acl_byteswap(void *buf, size_t size);
|
|
|
|
void zfs_znode_byteswap(void *buf, size_t size);
|
|
|
|
|
|
|
|
#define DS_FIND_SNAPSHOTS (1<<0)
|
|
|
|
#define DS_FIND_CHILDREN (1<<1)
|
2015-05-06 19:07:55 +03:00
|
|
|
#define DS_FIND_SERIALIZE (1<<2)
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The maximum number of bytes that can be accessed as part of one
|
|
|
|
* operation, including metadata.
|
|
|
|
*/
|
2014-11-03 23:15:08 +03:00
|
|
|
#define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */
|
2008-12-03 23:09:06 +03:00
|
|
|
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
#define DMU_USERUSED_OBJECT (-1ULL)
|
|
|
|
#define DMU_GROUPUSED_OBJECT (-2ULL)
|
2018-02-14 01:54:54 +03:00
|
|
|
#define DMU_PROJECTUSED_OBJECT (-3ULL)
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2016-10-04 21:46:10 +03:00
|
|
|
/*
|
2018-02-14 01:54:54 +03:00
|
|
|
* Zap prefix for object accounting in DMU_{USER,GROUP,PROJECT}USED_OBJECT.
|
2016-10-04 21:46:10 +03:00
|
|
|
*/
|
|
|
|
#define DMU_OBJACCT_PREFIX "obj-"
|
|
|
|
#define DMU_OBJACCT_PREFIX_LEN 4
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* artificial blkids for bonus buffer and spill blocks
|
|
|
|
*/
|
|
|
|
#define DMU_BONUS_BLKID (-1ULL)
|
|
|
|
#define DMU_SPILL_BLKID (-2ULL)
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Public routines to create, destroy, open, and close objsets.
|
|
|
|
*/
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg,
|
|
|
|
cred_t *cr, dmu_tx_t *tx);
|
|
|
|
|
2022-04-19 21:38:30 +03:00
|
|
|
int dmu_objset_hold(const char *name, const void *tag, objset_t **osp);
|
2010-05-29 00:45:14 +04:00
|
|
|
int dmu_objset_own(const char *name, dmu_objset_type_t type,
|
2022-04-19 21:38:30 +03:00
|
|
|
boolean_t readonly, boolean_t key_required, const void *tag,
|
|
|
|
objset_t **osp);
|
|
|
|
void dmu_objset_rele(objset_t *os, const void *tag);
|
|
|
|
void dmu_objset_disown(objset_t *os, boolean_t key_required, const void *tag);
|
2010-05-29 00:45:14 +04:00
|
|
|
int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
|
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
void dmu_objset_evict_dbufs(objset_t *os);
|
2010-05-29 00:45:14 +04:00
|
|
|
int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func,
|
|
|
|
void *arg);
|
2013-09-04 16:00:57 +04:00
|
|
|
int dmu_objset_clone(const char *name, const char *origin);
|
|
|
|
int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
|
2013-08-28 15:45:09 +04:00
|
|
|
struct nvlist *errlist);
|
|
|
|
int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
|
2019-09-25 19:20:30 +03:00
|
|
|
int dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
|
2008-11-20 23:01:55 +03:00
|
|
|
int flags);
|
|
|
|
void dmu_objset_byteswap(void *buf, size_t size);
|
2013-09-04 16:00:57 +04:00
|
|
|
int dsl_dataset_rename_snapshot(const char *fsname,
|
|
|
|
const char *oldsnapname, const char *newsnapname, boolean_t recursive);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
 * Describes one DMU buffer: the object it belongs to, its byte offset
 * within that object, its size in bytes, and a pointer to its data.
 */
typedef struct dmu_buf {
|
|
|
|
uint64_t db_object; /* object that this buffer is part of */
|
|
|
|
uint64_t db_offset; /* byte offset in this object */
|
|
|
|
uint64_t db_size; /* size of buffer in bytes */
|
|
|
|
void *db_data; /* data in buffer */
|
|
|
|
} dmu_buf_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The names of zap entries in the DIRECTORY_OBJECT of the MOS.
|
|
|
|
*/
|
|
|
|
#define DMU_POOL_DIRECTORY_OBJECT 1
|
|
|
|
#define DMU_POOL_CONFIG "config"
|
2012-12-14 03:24:15 +04:00
|
|
|
#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
|
|
|
|
#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
|
|
|
|
#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
|
2013-12-09 22:37:51 +04:00
|
|
|
#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define DMU_POOL_ROOT_DATASET "root_dataset"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
|
2008-11-20 23:01:55 +03:00
|
|
|
#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
|
|
|
|
#define DMU_POOL_ERRLOG_LAST "errlog_last"
|
|
|
|
#define DMU_POOL_SPARES "spares"
|
|
|
|
#define DMU_POOL_DEFLATE "deflate"
|
|
|
|
#define DMU_POOL_HISTORY "history"
|
|
|
|
#define DMU_POOL_PROPS "pool_props"
|
|
|
|
#define DMU_POOL_L2CACHE "l2cache"
|
2010-05-29 00:45:14 +04:00
|
|
|
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
|
|
|
|
#define DMU_POOL_DDT "DDT-%s-%s-%s"
|
|
|
|
#define DMU_POOL_DDT_STATS "DDT-statistics"
|
|
|
|
#define DMU_POOL_CREATION_VERSION "creation_version"
|
|
|
|
#define DMU_POOL_SCAN "scan"
|
|
|
|
#define DMU_POOL_FREE_BPOBJ "free_bpobj"
|
2012-12-14 03:24:15 +04:00
|
|
|
#define DMU_POOL_BPTREE_OBJ "bptree_obj"
|
2012-12-24 03:57:14 +04:00
|
|
|
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
|
2016-06-16 01:47:05 +03:00
|
|
|
#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
|
2016-04-11 23:16:57 +03:00
|
|
|
#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map"
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
#define DMU_POOL_REMOVING "com.delphix:removing"
|
|
|
|
#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
|
|
|
|
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
|
2016-12-17 01:11:29 +03:00
|
|
|
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
|
Log Spacemap Project
= Motivation
At Delphix we've seen a lot of customer systems where fragmentation
is over 75% and random writes take a performance hit because a lot
of time is spent on I/Os that update on-disk space accounting metadata.
Specifically, we've seen cases where 20% to 40% of sync time is spent
after sync pass 1 and ~30% of the I/Os on the system is spent updating
spacemaps.
The problem is that these pools have existed long enough that we've
touched almost every metaslab at least once, and random writes
scatter frees across all metaslabs every TXG, thus appending to
their spacemaps and resulting in many I/Os. To give an example,
assuming that every VDEV has 200 metaslabs and our writes fit within
a single spacemap block (generally 4K) we have 200 I/Os. Then if we
assume 2 levels of indirection, we need 400 additional I/Os and
since we are talking about metadata for which we keep 2 extra copies
for redundancy we need to triple that number, leading to a total of
1800 I/Os per VDEV every TXG.
We could try and decrease the number of metaslabs so we have less
I/Os per TXG but then each metaslab would cover a wider range on
disk and thus would take more time to be loaded in memory from disk.
In addition, after it's loaded, its range tree would consume more
memory.
Another idea would be to just increase the spacemap block size
which would allow us to fit more entries within an I/O block
resulting in fewer I/Os per metaslab and a speedup in loading time.
The problem is still that we don't deal with the number of I/Os
going up as the number of metaslabs is increasing and the fact
is that we generally write a lot to a few metaslabs and a little
to the rest of them. Thus, just increasing the block size would
actually waste bandwidth because we won't be utilizing our bigger
block size.
= About this patch
This patch introduces the Log Spacemap project which provides the
solution to the above problem while taking into account all the
aforementioned tradeoffs. The details on how it achieves that can
be found in the references sections below and in the code (see
Big Theory Statement in spa_log_spacemap.c).
Even though the change is fairly constrained within the metaslab
and lower-level SPA codepaths, there is a side-change that is
user-facing. The change is that VDEV IDs from VDEV holes will no
longer be reused. To give some background and reasoning for this,
when a log device is removed and its VDEV structure was replaced
with a hole (or was compacted; if at the end of the vdev array),
its vdev_id could be reused by devices added after that. Now
with the pool-wide space maps recording the vdev ID, this behavior
can cause problems (e.g. is this entry referring to a segment in
the new vdev or the removed log?). Thus, to simplify things the
ID reuse behavior is gone and now vdev IDs for top-level vdevs
are truly unique within a pool.
= Testing
The illumos implementation of this feature has been used internally
for a year and has been in production for ~6 months. For this patch
specifically there don't seem to be any regressions introduced to
ZTS and I have been running zloop for a week without any related
problems.
= Performance Analysis (Linux Specific)
All performance results and analysis for illumos can be found in
the links of the references. Redoing the same experiments in Linux
gave similar results. Below are the specifics of the Linux run.
After the pool reached stable state the percentage of the time
spent in pass 1 per TXG was 64% on average for the stock bits
while the log spacemap bits stayed at 95% during the experiment
(graph: sdimitro.github.io/img/linux-lsm/PercOfSyncInPassOne.png).
Sync times per TXG were 37.6 seconds on average for the stock
bits and 22.7 seconds for the log spacemap bits (related graph:
sdimitro.github.io/img/linux-lsm/SyncTimePerTXG.png). As a result
the log spacemap bits were able to push more TXGs, which is also
the reason why all graphs quantified per TXG have more entries for
the log spacemap bits.
Another interesting aspect in terms of txg syncs is that the stock
bits had 22% of their TXGs reach sync pass 7, 55% reach sync pass 8,
and 20% reach 9. The log space map bits reached sync pass 4 in 79%
of their TXGs, sync pass 7 in 19%, and sync pass 8 at 1%. This
emphasizes the fact that not only do we spend less time on metadata
but we also iterate fewer times to convergence in spa_sync() dirtying
objects.
[related graphs:
stock- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGStock.png
lsm- sdimitro.github.io/img/linux-lsm/NumberOfPassesPerTXGLSM.png]
Finally, the improvement in IOPs that the userland gains from the
change is approximately 40%. There is a consistent win in IOPS as
you can see from the graphs below but the absolute amount of
improvement that the log spacemap gives varies within each minute
interval.
sdimitro.github.io/img/linux-lsm/StockVsLog3Days.png
sdimitro.github.io/img/linux-lsm/StockVsLog10Hours.png
= Porting to Other Platforms
For people that want to port this commit to other platforms below
is a list of ZoL commits that this patch depends on:
Make zdb results for checkpoint tests consistent
db587941c5ff6dea01932bb78f70db63cf7f38ba
Update vdev_is_spacemap_addressable() for new spacemap encoding
419ba5914552c6185afbe1dd17b3ed4b0d526547
Simplify spa_sync by breaking it up to smaller functions
8dc2197b7b1e4d7ebc1420ea30e51c6541f1d834
Factor metaslab_load_wait() in metaslab_load()
b194fab0fb6caad18711abccaff3c69ad8b3f6d3
Rename range_tree_verify to range_tree_verify_not_present
df72b8bebe0ebac0b20e0750984bad182cb6564a
Change target size of metaslabs from 256GB to 16GB
c853f382db731e15a87512f4ef1101d14d778a55
zdb -L should skip leak detection altogether
21e7cf5da89f55ce98ec1115726b150e19eefe89
vs_alloc can underflow in L2ARC vdevs
7558997d2f808368867ca7e5234e5793446e8f3f
Simplify log vdev removal code
6c926f426a26ffb6d7d8e563e33fc176164175cb
Get rid of space_map_update() for ms_synced_length
425d3237ee88abc53d8522a7139c926d278b4b7f
Introduce auxiliary metaslab histograms
928e8ad47d3478a3d5d01f0dd6ae74a9371af65e
Error path in metaslab_load_impl() forgets to drop ms_sync_lock
8eef997679ba54547f7d361553d21b3291f41ae7
= References
Background, Motivation, and Internals of the Feature
- OpenZFS 2017 Presentation:
youtu.be/jj2IxRkl5bQ
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemaps-project
Flushing Algorithm Internals & Performance Results
(Illumos Specific)
- Blogpost:
sdimitro.github.io/post/zfs-lsm-flushing/
- OpenZFS 2018 Presentation:
youtu.be/x6D2dHRjkxw
- Slides:
slideshare.net/SerapheimNikolaosDim/zfs-log-spacemap-flushing-algorithm
Upstream Delphix Issues:
DLPX-51539, DLPX-59659, DLPX-57783, DLPX-61438, DLPX-41227, DLPX-59320
DLPX-63385
Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Closes #8442
2019-07-16 20:11:49 +03:00
|
|
|
#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap"
|
2019-07-26 20:54:14 +03:00
|
|
|
#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones"
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Allocate an object from this objset. The range of object numbers
|
|
|
|
* available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
|
|
|
|
*
|
|
|
|
* The transaction must be assigned to a txg. The newly allocated
|
|
|
|
* object will be "held" in the transaction (ie. you can modify the
|
|
|
|
* newly allocated object in this transaction).
|
|
|
|
*
|
|
|
|
* dmu_object_alloc() chooses an object and returns its object number.
|
|
|
|
*
|
|
|
|
* dmu_object_claim() allocates a specific object number. If that
|
|
|
|
* number is already allocated, it fails and returns EEXIST.
|
|
|
|
*
|
|
|
|
* dmu_object_claim() returns 0 on success, or ENOSPC or EEXIST as specified above.
|
|
|
|
*/
|
|
|
|
uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
|
|
|
|
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
|
2018-01-11 19:54:38 +03:00
|
|
|
uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
|
|
|
|
int indirect_blockshift,
|
|
|
|
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
/*
 * Object allocation interface that lets the caller specify the dnode
 * size of the newly allocated object (one of the DMU interfaces added
 * with the large_dnode feature).  A dnodesize of 0 means use the legacy
 * dnode size.
 *
 * NOTE(review): the uint64_t return value is presumably the new object
 * number -- confirm against the implementation.
 */
uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
|
|
|
|
int blocksize, dmu_object_type_t bonus_type, int bonus_len,
|
|
|
|
int dnodesize, dmu_tx_t *tx);
|
2019-01-11 01:37:43 +03:00
|
|
|
/*
 * NOTE(review): judging by the signature only, this is an allocation
 * variant that additionally takes an indirect block shift and hands a
 * dnode back to the caller through *allocated_dnode, with tag presumably
 * identifying the hold on it.  Confirm the hold/release contract and the
 * meaning of the return value against the implementation.
 */
uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot,
|
|
|
|
int blocksize, int indirect_blockshift, dmu_object_type_t bonustype,
|
2022-04-19 21:38:30 +03:00
|
|
|
int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag,
|
2019-01-11 01:37:43 +03:00
|
|
|
dmu_tx_t *tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * Claim interface without a dnodesize parameter.  When the large_dnode
 * feature was introduced, the interfaces that do not take a dnodesize
 * were made to call their _dnsize() counterparts with a dnodesize of 0,
 * i.e. the legacy dnode size.
 *
 * NOTE(review): confirm that this still holds for the current
 * implementation, and document the possible error returns.
 */
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
|
|
|
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
/*
 * Claim interface that lets the caller specify the dnode size (one of
 * the DMU interfaces added with the large_dnode feature).  A dnodesize
 * of 0 means use the legacy dnode size.
 *
 * NOTE(review): the int return is presumably 0 on success or an errno
 * value -- confirm against the implementation.
 */
int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
|
|
|
int blocksize, dmu_object_type_t bonus_type, int bonus_len,
|
|
|
|
int dnodesize, dmu_tx_t *tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * Reclaim interface without a dnodesize parameter.  When the large_dnode
 * feature was introduced, the interfaces that do not take a dnodesize
 * were made to call their _dnsize() counterparts with a dnodesize of 0,
 * i.e. the legacy dnode size.
 *
 * NOTE(review): confirm that this still holds for the current
 * implementation, and document the possible error returns.
 */
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
2014-09-12 07:28:35 +04:00
|
|
|
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
/*
 * Reclaim interface that lets the caller specify the dnode size (one of
 * the DMU interfaces added with the large_dnode feature).  A dnodesize
 * of 0 means use the legacy dnode size.
 *
 * NOTE(review): keep_spill presumably controls whether an existing
 * spill block is preserved across the reclaim; confirm, along with the
 * error returns, against the implementation.
 */
int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
|
|
|
|
dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
|
2019-05-08 01:18:44 +03:00
|
|
|
int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx);
|
|
|
|
/*
 * NOTE(review): presumably removes the spill block of the given object
 * as part of tx -- inferred from the name only.  Confirm the behavior
 * and the meaning of the int return value against the implementation.
 */
int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Free an object from this objset.
|
|
|
|
*
|
|
|
|
* The object's data will be freed as well (ie. you don't need to call
|
|
|
|
* dmu_free(object, 0, -1, tx)).
|
|
|
|
*
|
|
|
|
* The object need not be held in the transaction.
|
|
|
|
*
|
|
|
|
* If there are any holds on this object's buffers (via dmu_buf_hold()),
|
|
|
|
* or tx holds on the object (via dmu_tx_hold_object()), you can not
|
|
|
|
* free it; it fails and returns EBUSY.
|
|
|
|
*
|
|
|
|
* If the object is not allocated, it fails and returns ENOENT.
|
|
|
|
*
|
|
|
|
* Return 0 on success, or EBUSY or ENOENT as specified above.
|
|
|
|
*/
|
|
|
|
int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the next allocated or free object.
|
|
|
|
*
|
|
|
|
* The objectp parameter is in-out. It will be updated to be the next
|
|
|
|
* object which is allocated. Ignore objects which have not been
|
|
|
|
* modified since txg.
|
|
|
|
*
|
|
|
|
* XXX Can only be called on an objset with no dirty data.
|
|
|
|
*
|
|
|
|
* Returns 0 on success, or ENOENT if there are no more objects.
|
|
|
|
*/
|
|
|
|
int dmu_object_next(objset_t *os, uint64_t *objectp,
|
|
|
|
boolean_t hole, uint64_t txg);
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
/*
|
|
|
|
* Set the number of levels on a dnode. nlevels must be greater than the
|
|
|
|
* current number of levels or an EINVAL will be returned.
|
|
|
|
*/
|
|
|
|
int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels,
|
|
|
|
dmu_tx_t *tx);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Set the data blocksize for an object.
|
|
|
|
*
|
2019-08-30 19:53:15 +03:00
|
|
|
* The object cannot have any blocks allocated beyond the first. If
|
2008-11-20 23:01:55 +03:00
|
|
|
* the first block is allocated already, the new size must be greater
|
|
|
|
* than the current block size. If these conditions are not met,
|
|
|
|
* ENOTSUP will be returned.
|
|
|
|
*
|
|
|
|
* Returns 0 on success, or EBUSY if there are any holds on the object
|
|
|
|
* contents, or ENOTSUP as described above.
|
|
|
|
*/
|
|
|
|
int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
|
|
|
|
int ibs, dmu_tx_t *tx);
|
|
|
|
|
2017-11-08 22:12:59 +03:00
|
|
|
/*
|
|
|
|
* Manually set the maxblkid on a dnode. This will adjust nlevels accordingly
|
2019-03-13 20:52:01 +03:00
|
|
|
* to accommodate the change. When calling this function, the caller must
|
|
|
|
* ensure that the object's nlevels can sufficiently support the new maxblkid.
|
2017-11-08 22:12:59 +03:00
|
|
|
*/
|
|
|
|
int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
|
|
|
|
dmu_tx_t *tx);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Set the checksum property on a dnode. The new checksum algorithm will
|
|
|
|
* apply to all newly written blocks; existing blocks will not be affected.
|
|
|
|
*/
|
|
|
|
void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
|
|
|
|
dmu_tx_t *tx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set the compress property on a dnode. The new compression algorithm will
|
|
|
|
* apply to all newly written blocks; existing blocks will not be affected.
|
|
|
|
*/
|
|
|
|
void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
|
|
|
|
dmu_tx_t *tx);
|
|
|
|
|
2017-08-24 02:54:24 +03:00
|
|
|
/*
 * NOTE(review): judging by the parameters, this writes caller-supplied,
 * already-encoded data at offset within object: compressed_size bytes
 * from data, compressed with comp, expanding to uncompressed_size bytes,
 * tagged with embedded type etype and the given byteorder.  Confirm the
 * exact contract (size limits, valid etype/comp values) against the
 * implementation.
 */
void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
|
2014-06-06 01:19:08 +04:00
|
|
|
void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
|
|
|
|
int compressed_size, int byteorder, dmu_tx_t *tx);
|
Implement Redacted Send/Receive
Redacted send/receive allows users to send subsets of their data to
a target system. One possible use case for this feature is to not
transmit sensitive information to a data warehousing, test/dev, or
analytics environment. Another is to save space by not replicating
unimportant data within a given dataset, for example in backup tools
like zrepl.
Redacted send/receive is a three-stage process. First, a clone (or
clones) is made of the snapshot to be sent to the target. In this
clone (or clones), all unnecessary or unwanted data is removed or
modified. This clone is then snapshotted to create the "redaction
snapshot" (or snapshots). Second, the new zfs redact command is used
to create a redaction bookmark. The redaction bookmark stores the
list of blocks in a snapshot that were modified by the redaction
snapshot(s). Finally, the redaction bookmark is passed as a parameter
to zfs send. When sending to the snapshot that was redacted, the
redaction bookmark is used to filter out blocks that contain sensitive
or unwanted information, and those blocks are not included in the send
stream. When sending from the redaction bookmark, the blocks it
contains are considered as candidate blocks in addition to those
blocks in the destination snapshot that were modified since the
creation_txg of the redaction bookmark. This step is necessary to
allow the target to rehydrate data in the case where some blocks are
accidentally or unnecessarily modified in the redaction snapshot.
The changes to bookmarks to enable fast space estimation involve
adding deadlists to bookmarks. There is also logic to manage the
life cycles of these deadlists.
The new size estimation process operates in cases where previously
an accurate estimate could not be provided. In those cases, a send
is performed where no data blocks are read, reducing the runtime
significantly and providing a byte-accurate size estimate.
Reviewed-by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Chris Williamson <chris.williamson@delphix.com>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #7958
2019-06-19 19:48:13 +03:00
|
|
|
/*
 * NOTE(review): presumably marks size bytes starting at offset within
 * object as redacted, in support of redacted send/receive -- inferred
 * from the name and parameters; confirm against the implementation.
 */
void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
|
|
|
dmu_tx_t *tx);
|
2014-06-06 01:19:08 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Decide how to write a block: checksum, compression, number of copies, etc.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
 * Write-policy flags.  Each value occupies a distinct bit, so the flags
 * can be OR-ed together.
 *
 * NOTE(review): presumably these are the values accepted by the wp
 * argument of dmu_write_policy(); the per-flag meanings (no-fill write,
 * dmu_sync() write, spill block write) are inferred from the names --
 * confirm against the implementation.
 */
#define WP_NOFILL 0x1
|
|
|
|
#define WP_DMU_SYNC 0x2
|
|
|
|
#define WP_SPILL 0x4
|
|
|
|
|
2017-03-23 19:07:27 +03:00
|
|
|
/*
 * Decide how a block is to be written (checksum, compression, number of
 * copies, etc.).
 *
 * NOTE(review): presumably the decision is made for a block at the given
 * level of dn in os, wp is a mask of WP_* flags, and the result is
 * stored in *zp -- confirm against the implementation.
 */
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
|
|
|
|
struct zio_prop *zp);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* The bonus data is accessed more or less like a regular buffer.
|
|
|
|
* You must dmu_bonus_hold() to get the buffer, which will give you a
|
|
|
|
* dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
* data. As with any normal buffer, you must call dmu_buf_will_dirty()
|
|
|
|
* before modifying it, and the
|
2008-11-20 23:01:55 +03:00
|
|
|
* object must be held in an assigned transaction before calling
|
|
|
|
* dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
|
2013-03-30 06:27:50 +04:00
|
|
|
* buffer as well. You must release what you hold with dmu_buf_rele().
|
2013-06-11 21:12:34 +04:00
|
|
|
*
|
|
|
|
* Returns ENOENT, EIO, or 0.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2022-04-19 21:38:30 +03:00
|
|
|
int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag,
|
|
|
|
dmu_buf_t **dbp);
|
|
|
|
int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
|
2019-01-11 01:37:43 +03:00
|
|
|
uint32_t flags);
|
2008-11-20 23:01:55 +03:00
|
|
|
int dmu_bonus_max(void);
|
|
|
|
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
|
2010-05-29 00:45:14 +04:00
|
|
|
int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
|
2010-08-27 01:24:34 +04:00
|
|
|
dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
|
2010-05-29 00:45:14 +04:00
|
|
|
int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Special spill buffer support used by "SA" framework
|
|
|
|
*/
|
|
|
|
|
2022-04-19 21:38:30 +03:00
|
|
|
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
|
2018-06-06 20:16:41 +03:00
|
|
|
dmu_buf_t **dbp);
|
OpenZFS 7004 - dmu_tx_hold_zap() does dnode_hold() 7x on same object
Using a benchmark which has 32 threads creating 2 million files in the
same directory, on a machine with 16 CPU cores, I observed poor
performance. I noticed that dmu_tx_hold_zap() was using about 30% of
all CPU, and doing dnode_hold() 7 times on the same object (the ZAP
object that is being held).
dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is
running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the
dnode_t that we already have in hand, rather than repeatedly calling
dnode_hold(). To do this, we need to pass the dnode_t down through
all the intermediate calls that dmu_tx_hold_zap() makes, making these
routines take the dnode_t* rather than an objset_t* and a uint64_t
object number. In particular, the following routines will need to have
analogous *_by_dnode() variants created:
dmu_buf_hold_noread()
dmu_buf_hold()
zap_lookup()
zap_lookup_norm()
zap_count_write()
zap_lockdir()
zap_count_write()
This can improve performance on the benchmark described above by 100%,
from 30,000 file creations per second to 60,000. (This improvement is on
top of that provided by working around the object allocation issue. Peak
performance of ~90,000 creations per second was observed with 8 CPUs;
adding CPUs past that decreased performance due to lock contention.) The
CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds
to 40 CPU-seconds.
Sponsored by: Intel Corp.
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7004
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/109
Closes #4641
Closes #4972
2016-07-21 01:42:13 +03:00
|
|
|
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
|
2022-04-19 21:38:30 +03:00
|
|
|
const void *tag, dmu_buf_t **dbp);
|
|
|
|
int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Obtain the DMU buffer from the specified object which contains the
|
|
|
|
* specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
|
|
|
|
* that it will remain in memory. You must release the hold with
|
2013-03-30 06:27:50 +04:00
|
|
|
* dmu_buf_rele(). You must not access the dmu_buf_t after releasing
|
|
|
|
* what you hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
|
|
|
|
* on the returned buffer before reading or writing the buffer's
|
|
|
|
* db_data. The comments for those routines describe what particular
|
|
|
|
* operations are valid after calling them.
|
|
|
|
*
|
|
|
|
* The object number must be a valid, allocated object number.
|
|
|
|
*/
|
|
|
|
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
|
2022-04-19 21:38:30 +03:00
|
|
|
const void *tag, dmu_buf_t **, int flags);
|
2021-10-13 21:01:01 +03:00
|
|
|
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
|
2022-04-19 21:38:30 +03:00
|
|
|
uint64_t length, int read, const void *tag, int *numbufsp,
|
|
|
|
dmu_buf_t ***dbpp);
|
OpenZFS 7004 - dmu_tx_hold_zap() does dnode_hold() 7x on same object
Using a benchmark which has 32 threads creating 2 million files in the
same directory, on a machine with 16 CPU cores, I observed poor
performance. I noticed that dmu_tx_hold_zap() was using about 30% of
all CPU, and doing dnode_hold() 7 times on the same object (the ZAP
object that is being held).
dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is
running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the
dnode_t that we already have in hand, rather than repeatedly calling
dnode_hold(). To do this, we need to pass the dnode_t down through
all the intermediate calls that dmu_tx_hold_zap() makes, making these
routines take the dnode_t* rather than an objset_t* and a uint64_t
object number. In particular, the following routines will need to have
analogous *_by_dnode() variants created:
dmu_buf_hold_noread()
dmu_buf_hold()
zap_lookup()
zap_lookup_norm()
zap_count_write()
zap_lockdir()
zap_count_write()
This can improve performance on the benchmark described above by 100%,
from 30,000 file creations per second to 60,000. (This improvement is on
top of that provided by working around the object allocation issue. Peak
performance of ~90,000 creations per second was observed with 8 CPUs;
adding CPUs past that decreased performance due to lock contention.) The
CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds
to 40 CPU-seconds.
Sponsored by: Intel Corp.
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7004
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/109
Closes #4641
Closes #4972
2016-07-21 01:42:13 +03:00
|
|
|
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
|
2022-04-19 21:38:30 +03:00
|
|
|
const void *tag, dmu_buf_t **dbp, int flags);
|
2019-10-11 20:06:18 +03:00
|
|
|
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
|
2022-04-19 21:38:30 +03:00
|
|
|
uint64_t length, boolean_t read, const void *tag, int *numbufsp,
|
2019-10-11 20:06:18 +03:00
|
|
|
dmu_buf_t ***dbpp, uint32_t flags);
|
2015-04-02 14:59:15 +03:00
|
|
|
/*
|
|
|
|
* Add a reference to a dmu buffer that has already been held via
|
|
|
|
* dmu_buf_hold() in the current context.
|
|
|
|
*/
|
2022-04-19 21:49:30 +03:00
|
|
|
void dmu_buf_add_ref(dmu_buf_t *db, const void *tag);
|
2015-04-02 14:59:15 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Attempt to add a reference to a dmu buffer that is in an unknown state,
|
|
|
|
* using a pointer that may have been invalidated by eviction processing.
|
|
|
|
* The request will succeed if the passed in dbuf still represents the
|
|
|
|
* same os/object/blkid, is ineligible for eviction, and has at least
|
|
|
|
* one hold by a user other than the syncer.
|
|
|
|
*/
|
|
|
|
boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
|
2022-04-19 21:38:30 +03:00
|
|
|
uint64_t blkid, const void *tag);
|
2015-04-02 14:59:15 +03:00
|
|
|
|
2022-04-19 21:38:30 +03:00
|
|
|
void dmu_buf_rele(dmu_buf_t *db, const void *tag);
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t dmu_buf_refcount(dmu_buf_t *db);
|
2018-06-19 00:10:54 +03:00
|
|
|
uint64_t dmu_buf_user_refcount(dmu_buf_t *db);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
|
|
|
|
* range of an object. A pointer to an array of dmu_buf_t*'s is
|
|
|
|
* returned (in *dbpp).
|
|
|
|
*
|
|
|
|
* dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
|
|
|
|
* frees the array. The hold on the array of buffers MUST be released
|
|
|
|
* with dmu_buf_rele_array. You can NOT release the hold on each buffer
|
|
|
|
* individually with dmu_buf_rele.
|
|
|
|
*/
|
|
|
|
int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
|
2022-04-19 21:38:30 +03:00
|
|
|
uint64_t length, boolean_t read, const void *tag,
|
2015-12-27 00:10:31 +03:00
|
|
|
int *numbufsp, dmu_buf_t ***dbpp);
|
2022-04-19 21:38:30 +03:00
|
|
|
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, const void *tag);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2015-04-02 06:44:32 +03:00
|
|
|
typedef void dmu_buf_evict_func_t(void *user_ptr);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2015-04-02 06:44:32 +03:00
|
|
|
* A DMU buffer user object may be associated with a dbuf for the
|
|
|
|
* duration of its lifetime. This allows the user of a dbuf (client)
|
|
|
|
* to attach private data to a dbuf (e.g. in-core only data such as a
|
|
|
|
* dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
|
|
|
|
* when that dbuf has been evicted. Clients typically respond to the
|
|
|
|
* eviction notification by freeing their private data, thus ensuring
|
|
|
|
* the same lifetime for both dbuf and private data.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2015-04-02 06:44:32 +03:00
|
|
|
* The mapping from a dmu_buf_user_t to any client private data is the
|
|
|
|
* client's responsibility. All current consumers of the API with private
|
|
|
|
* data embed a dmu_buf_user_t as the first member of the structure for
|
|
|
|
* their private data. This allows conversions between the two types
|
|
|
|
* with a simple cast. Since the DMU buf user API never needs access
|
|
|
|
* to the private data, other strategies can be employed if necessary
|
|
|
|
* or convenient for the client (e.g. using container_of() to do the
|
|
|
|
* conversion for private data that cannot have the dmu_buf_user_t as
|
|
|
|
* its first member).
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2015-04-02 06:44:32 +03:00
|
|
|
* Eviction callbacks are executed without the dbuf mutex held or any
|
|
|
|
* other type of mechanism to guarantee that the dbuf is still available.
|
|
|
|
* For this reason, users must assume the dbuf has already been freed
|
|
|
|
* and not reference the dbuf from the callback context.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2015-04-02 06:44:32 +03:00
|
|
|
* Users requesting "immediate eviction" are notified as soon as the dbuf
|
|
|
|
* is only referenced by dirty records (dirties == holds). Otherwise the
|
|
|
|
* notification occurs after eviction processing for the dbuf begins.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2015-04-02 06:44:32 +03:00
|
|
|
typedef struct dmu_buf_user {
|
|
|
|
/*
|
|
|
|
* Asynchronous user eviction callback state.
|
|
|
|
*/
|
|
|
|
taskq_ent_t dbu_tqent;
|
|
|
|

2017-01-27 01:43:28 +03:00
|
|
|
/*
|
|
|
|
* This instance's eviction function pointers.
|
|
|
|
*
|
|
|
|
* dbu_evict_func_sync is called synchronously and then
|
|
|
|
* dbu_evict_func_async is executed asynchronously on a taskq.
* dmu_buf_init_user() requires that at least one of the two be
* non-NULL.
|
|
|
|
*/
|
|
|
|
dmu_buf_evict_func_t *dbu_evict_func_sync;
|
|
|
|
dmu_buf_evict_func_t *dbu_evict_func_async;
|
2015-04-02 06:44:32 +03:00
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
/*
|
|
|
|
* Pointer to user's dbuf pointer. NULL for clients that do
|
|
|
|
* not associate a dbuf with their user data.
|
|
|
|
*
|
|
|
|
* The dbuf pointer is cleared upon eviction so as to catch
|
|
|
|
* use-after-evict bugs in clients.
*
* This member is only compiled in when ZFS_DEBUG is defined.
|
|
|
|
*/
|
|
|
|
dmu_buf_t **dbu_clear_on_evict_dbufp;
|
|
|
|
#endif /* ZFS_DEBUG */
|
|
|
|
} dmu_buf_user_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2015-04-02 06:44:32 +03:00
|
|
|
* Initialize the given dmu_buf_user_t instance with its eviction functions,
|
|
|
|
* evict_func_sync and/or evict_func_async, to be called when the user is
* evicted.
|
|
|
|
*
|
|
|
|
* NOTE: This function should only be called once on a given dmu_buf_user_t.
|
|
|
|
* To allow enforcement of this, dbu must already be zeroed on entry.
*
* At least one of evict_func_sync and evict_func_async must be non-NULL.
* clear_on_evict_dbufp is only stored when ZFS_DEBUG is defined; otherwise
* it is unused.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2015-04-02 06:44:32 +03:00
|
|
|
static inline void
|
2017-01-27 01:43:28 +03:00
|
|
|
dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
|
2020-07-24 03:41:48 +03:00
|
|
|
dmu_buf_evict_func_t *evict_func_async,
|
|
|
|
dmu_buf_t **clear_on_evict_dbufp __maybe_unused)
|
2015-04-02 06:44:32 +03:00
|
|
|
{
|
2017-01-27 01:43:28 +03:00
|
|
|
/* dbu must be zeroed on entry, so neither callback may be set yet */
ASSERT(dbu->dbu_evict_func_sync == NULL);
|
|
|
|
ASSERT(dbu->dbu_evict_func_async == NULL);
|
|
|
|

|
|
|
/* must have at least one evict func */
|
|
|
|
IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
|
|
|
|
dbu->dbu_evict_func_sync = evict_func_sync;
|
|
|
|
dbu->dbu_evict_func_async = evict_func_async;
|
2015-05-16 18:40:45 +03:00
|
|
|
/* dbu_tqent holds the asynchronous eviction callback state */
taskq_init_ent(&dbu->dbu_tqent);
|
2015-04-02 06:44:32 +03:00
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
|
|
|
|
#endif /* ZFS_DEBUG */
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2015-04-02 06:44:32 +03:00
|
|
|
* Attach user data to a dbuf and mark it for normal (when the dbuf's
|
|
|
|
* data is cleared or its reference count goes to zero) eviction processing.
|
|
|
|
*
|
|
|
|
* Returns NULL on success, or the existing user if another user currently
|
|
|
|
* owns the buffer.
|
|
|
|
*/
|
|
|
|
void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Attach user data to a dbuf and mark it for immediate (its dirty and
|
|
|
|
* reference counts are equal) eviction processing.
|
|
|
|
*
|
|
|
|
* Returns NULL on success, or the existing user if another user currently
|
|
|
|
* owns the buffer.
|
|
|
|
*/
|
|
|
|
void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Replace the current user of a dbuf.
|
|
|
|
*
|
|
|
|
* If given the current user of a dbuf, replaces the dbuf's user with
|
|
|
|
* "new_user" and returns the user data pointer that was replaced.
|
|
|
|
* Otherwise returns the current, and unmodified, dbuf user pointer.
|
|
|
|
*/
|
|
|
|
void *dmu_buf_replace_user(dmu_buf_t *db,
|
|
|
|
dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove the specified user data for a DMU buffer.
|
|
|
|
*
|
|
|
|
* Returns the user that was removed on success, or the current user if
|
|
|
|
* another user currently owns the buffer.
|
|
|
|
*/
|
|
|
|
void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns the user data (dmu_buf_user_t *) associated with this dbuf.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
void *dmu_buf_get_user(dmu_buf_t *db);
|
|
|
|
|
2016-07-21 01:39:55 +03:00
|
|
|
objset_t *dmu_buf_get_objset(dmu_buf_t *db);
|
OpenZFS 7004 - dmu_tx_hold_zap() does dnode_hold() 7x on same object
Using a benchmark which has 32 threads creating 2 million files in the
same directory, on a machine with 16 CPU cores, I observed poor
performance. I noticed that dmu_tx_hold_zap() was using about 30% of
all CPU, and doing dnode_hold() 7 times on the same object (the ZAP
object that is being held).
dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is
running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the
dnode_t that we already have in hand, rather than repeatedly calling
dnode_hold(). To do this, we need to pass the dnode_t down through
all the intermediate calls that dmu_tx_hold_zap() makes, making these
routines take the dnode_t* rather than an objset_t* and a uint64_t
object number. In particular, the following routines will need to have
analogous *_by_dnode() variants created:
dmu_buf_hold_noread()
dmu_buf_hold()
zap_lookup()
zap_lookup_norm()
zap_count_write()
zap_lockdir()
zap_count_write()
This can improve performance on the benchmark described above by 100%,
from 30,000 file creations per second to 60,000. (This improvement is on
top of that provided by working around the object allocation issue. Peak
performance of ~90,000 creations per second was observed with 8 CPUs;
adding CPUs past that decreased performance due to lock contention.) The
CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds
to 40 CPU-seconds.
Sponsored by: Intel Corp.
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7004
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/109
Closes #4641
Closes #4972
2016-07-21 01:42:13 +03:00
|
|
|
dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
|
|
|
|
void dmu_buf_dnode_exit(dmu_buf_t *db);
|
2016-07-21 01:39:55 +03:00
|
|
|
|
2015-04-02 06:44:32 +03:00
|
|
|
/* Block until any in-progress dmu buf user evictions complete. */
|
|
|
|
void dmu_buf_user_evict_wait(void);
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
|
|
|
* Returns the blkptr associated with this dbuf, or NULL if not set.
|
|
|
|
*/
|
|
|
|
struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Indicate that you are going to modify the buffer's data (db_data).
|
|
|
|
*
|
|
|
|
* The transaction (tx) must be assigned to a txg (ie. you've called
|
|
|
|
* dmu_tx_assign()). The buffer's object must be held in the tx
|
|
|
|
* (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
|
|
|
|
*/
|
|
|
|
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
|
2019-03-06 20:50:55 +03:00
|
|
|
boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
|
2018-04-17 21:06:54 +03:00
|
|
|
void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
|
|
|
|
const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* You must create a transaction, then hold the objects which you will
|
|
|
|
* (or might) modify as part of this transaction. Then you must assign
|
|
|
|
* the transaction to a transaction group. Once the transaction has
|
|
|
|
* been assigned, you can modify buffers which belong to held objects as
|
|
|
|
* part of this transaction. You can't modify buffers before the
|
|
|
|
* transaction has been assigned; you can't modify buffers which don't
|
|
|
|
* belong to objects which this transaction holds; you can't hold
|
|
|
|
* objects once the transaction has been assigned. You may hold an
|
|
|
|
* object which you are going to free (with dmu_object_free()), but you
|
|
|
|
* don't have to.
|
|
|
|
*
|
|
|
|
* You can abort the transaction before it has been assigned.
|
|
|
|
*
|
|
|
|
* Note that you may hold buffers (with dmu_buf_hold) at any time,
|
|
|
|
* regardless of transaction state.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define DMU_NEW_OBJECT (-1ULL)
|
|
|
|
#define DMU_OBJECT_END (-1ULL)
|
|
|
|
|
|
|
|
dmu_tx_t *dmu_tx_create(objset_t *os);
|
|
|
|
void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
|
2017-01-14 01:58:41 +03:00
|
|
|
void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
|
|
|
|
int len);
|
2008-11-20 23:01:55 +03:00
|
|
|
void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
|
|
|
|
uint64_t len);
|
2017-01-14 01:58:41 +03:00
|
|
|
void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
|
|
|
|
uint64_t len);
|
2009-07-03 02:44:48 +04:00
|
|
|
void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
|
2017-01-14 01:58:41 +03:00
|
|
|
void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
|
|
|
|
const char *name);
|
2008-11-20 23:01:55 +03:00
|
|
|
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
|
2017-01-14 01:58:41 +03:00
|
|
|
void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
|
2010-05-29 00:45:14 +04:00
|
|
|
void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
|
|
|
|
void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
|
|
|
|
void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
void dmu_tx_abort(dmu_tx_t *tx);
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issued`, leading to the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issued` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
|
2008-11-20 23:01:55 +03:00
|
|
|
void dmu_tx_wait(dmu_tx_t *tx);
|
|
|
|
void dmu_tx_commit(dmu_tx_t *tx);
|
2014-07-07 23:49:36 +04:00
|
|
|
void dmu_tx_mark_netfree(dmu_tx_t *tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* To register a commit callback, dmu_tx_callback_register() must be called.
|
|
|
|
*
|
|
|
|
* dcb_data is a pointer to caller private data that is passed on as a
|
|
|
|
* callback parameter. The caller is responsible for properly allocating and
|
|
|
|
* freeing it.
|
|
|
|
*
|
|
|
|
* When registering a callback, the transaction must be already created, but
|
|
|
|
* it cannot be committed or aborted. It can be assigned to a txg or not.
|
|
|
|
*
|
|
|
|
* The callback will be called after the transaction has been safely written
|
|
|
|
* to stable storage and will also be called if the dmu_tx is aborted.
|
|
|
|
* If there is any error which prevents the transaction from being committed to
|
|
|
|
* disk, the callback will be called with a value of error != 0.
|
2017-12-22 21:19:51 +03:00
|
|
|
*
|
|
|
|
* When multiple callbacks are registered to the transaction, the callbacks
|
|
|
|
* will be called in reverse order to let Lustre, the only user of commit
|
|
|
|
* callback currently, take the fast path of its commit callback handling.
|
2010-05-29 00:45:14 +04:00
|
|
|
*/
|
|
|
|
typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
|
|
|
|
|
|
|
|
void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
|
|
|
|
void *dcb_data);
|
2017-12-22 21:19:51 +03:00
|
|
|
void dmu_tx_do_callbacks(list_t *cb_list, int error);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Free up the data blocks for a defined range of a file. If size is
|
2012-12-14 03:24:15 +04:00
|
|
|
* -1, the range from offset to end-of-file is freed.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
|
2017-09-28 18:49:13 +03:00
|
|
|
uint64_t size, dmu_tx_t *tx);
|
2008-12-03 23:09:06 +03:00
|
|
|
int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
|
2017-09-28 18:49:13 +03:00
|
|
|
uint64_t size);
|
2013-08-21 08:11:52 +04:00
|
|
|
int dmu_free_long_object(objset_t *os, uint64_t object);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Convenience functions.
|
|
|
|
*
|
|
|
|
* Canfail routines will return 0 on success, or an errno if there is a
|
|
|
|
* nonrecoverable I/O error.
|
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
#define DMU_READ_PREFETCH 0 /* prefetch */
|
|
|
|
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */
|
2008-11-20 23:01:55 +03:00
|
|
|
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
2009-07-03 02:44:48 +04:00
|
|
|
void *buf, uint32_t flags);
|
2017-01-14 01:58:41 +03:00
|
|
|
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
|
|
|
|
uint32_t flags);
|
2008-11-20 23:01:55 +03:00
|
|
|
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
|
|
|
const void *buf, dmu_tx_t *tx);
|
2017-01-14 01:58:41 +03:00
|
|
|
void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
|
|
|
|
const void *buf, dmu_tx_t *tx);
|
2008-12-03 23:09:06 +03:00
|
|
|
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
|
|
|
dmu_tx_t *tx);
|
2010-08-26 22:45:02 +04:00
|
|
|
#ifdef _KERNEL
|
2021-01-21 08:27:30 +03:00
|
|
|
int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size);
|
|
|
|
int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size);
|
|
|
|
int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size);
|
|
|
|
int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
|
2010-12-17 20:14:38 +03:00
|
|
|
dmu_tx_t *tx);
|
2021-01-21 08:27:30 +03:00
|
|
|
int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
|
2010-12-17 20:14:38 +03:00
|
|
|
dmu_tx_t *tx);
|
2021-01-21 08:27:30 +03:00
|
|
|
int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
|
2017-06-13 19:18:08 +03:00
|
|
|
dmu_tx_t *tx);
|
2010-12-17 20:14:38 +03:00
|
|
|
#endif
|
2009-07-03 02:44:48 +04:00
|
|
|
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
|
|
|
|
void dmu_return_arcbuf(struct arc_buf *buf);
|
2019-01-18 02:47:08 +03:00
|
|
|
int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset,
|
2017-09-28 18:49:13 +03:00
|
|
|
struct arc_buf *buf, dmu_tx_t *tx);
|
2019-01-18 02:47:08 +03:00
|
|
|
int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset,
|
2017-09-28 18:49:13 +03:00
|
|
|
struct arc_buf *buf, dmu_tx_t *tx);
|
|
|
|
/* Alias: dmu_assign_arcbuf expands to the by-dbuf variant declared above. */
#define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf
|
2014-11-03 23:15:08 +03:00
|
|
|
extern int zfs_max_recordsize;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Asynchronously try to read in the data.
|
|
|
|
*/
|
2015-12-22 04:31:57 +03:00
|
|
|
void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
|
|
|
|
uint64_t len, enum zio_priority pri);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
typedef struct dmu_object_info {
|
2010-05-29 00:45:14 +04:00
|
|
|
/* All sizes are in bytes unless otherwise indicated. */
|
2008-11-20 23:01:55 +03:00
|
|
|
uint32_t doi_data_block_size;
|
|
|
|
uint32_t doi_metadata_block_size;
|
|
|
|
dmu_object_type_t doi_type;
|
|
|
|
dmu_object_type_t doi_bonus_type;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t doi_bonus_size;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint8_t doi_indirection; /* 2 = dnode->indirect->data */
|
|
|
|
uint8_t doi_checksum;
|
|
|
|
uint8_t doi_compress;
|
2014-09-12 07:28:35 +04:00
|
|
|
uint8_t doi_nblkptr;
|
|
|
|
uint8_t doi_pad[4];
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
uint64_t doi_dnodesize;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
|
|
|
|
uint64_t doi_max_offset;
|
|
|
|
uint64_t doi_fill_count; /* number of non-empty blocks */
|
2008-11-20 23:01:55 +03:00
|
|
|
} dmu_object_info_t;
|
|
|
|
|
2013-02-15 08:37:43 +04:00
|
|
|
/*
 * Const-qualified pointer to a byteswap routine that is given a buffer and
 * a size.  NOTE(review): presumably swaps in place, since buf is non-const
 * and nothing is returned -- confirm.
 */
typedef void (*const arc_byteswap_func_t)(void *buf, size_t size);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
typedef struct dmu_object_type_info {
|
2012-12-14 03:24:15 +04:00
|
|
|
dmu_object_byteswap_t ot_byteswap;
|
2008-11-20 23:01:55 +03:00
|
|
|
boolean_t ot_metadata;
|
2018-07-10 20:49:50 +03:00
|
|
|
boolean_t ot_dbuf_metadata_cache;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
boolean_t ot_encrypt;
|
2022-04-19 21:38:30 +03:00
|
|
|
const char *ot_name;
|
2008-11-20 23:01:55 +03:00
|
|
|
} dmu_object_type_info_t;
|
|
|
|
|
2013-02-15 08:37:43 +04:00
|
|
|
typedef const struct dmu_object_byteswap_info {
|
|
|
|
arc_byteswap_func_t ob_func;
|
2022-04-19 21:38:30 +03:00
|
|
|
const char *ob_name;
|
2012-12-14 03:24:15 +04:00
|
|
|
} dmu_object_byteswap_info_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * Read-only table of DMU_OT_NUMTYPES object type descriptors, defined
 * elsewhere.  NOTE(review): presumably indexed by dmu_object_type_t --
 * confirm.
 */
extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
|
2012-12-14 03:24:15 +04:00
|
|
|
/*
 * Read-only table of DMU_BSWAP_NUMFUNCS byteswap descriptors, defined
 * elsewhere.  NOTE(review): presumably indexed by dmu_object_byteswap_t
 * (the type of ot_byteswap) -- confirm.
 */
extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Get information on a DMU object.
|
|
|
|
*
|
|
|
|
* Return 0 on success or ENOENT if object is not allocated.
|
|
|
|
*
|
|
|
|
* If doi is NULL, just indicates whether the object exists.
|
|
|
|
*/
|
|
|
|
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
|
2013-10-03 04:11:19 +04:00
|
|
|
/*
 * NOTE(review): the leading double underscore suggests an internal variant
 * of dmu_object_info_from_dnode(); how the two differ (e.g. in what the
 * caller must already hold) is not visible in this header -- confirm
 * before calling this directly.
 */
void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Like dmu_object_info, but faster if you have a held dnode in hand. */
|
OpenZFS 7004 - dmu_tx_hold_zap() does dnode_hold() 7x on same object
Using a benchmark which has 32 threads creating 2 million files in the
same directory, on a machine with 16 CPU cores, I observed poor
performance. I noticed that dmu_tx_hold_zap() was using about 30% of
all CPU, and doing dnode_hold() 7 times on the same object (the ZAP
object that is being held).
dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is
running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the
dnode_t that we already have in hand, rather than repeatedly calling
dnode_hold(). To do this, we need to pass the dnode_t down through
all the intermediate calls that dmu_tx_hold_zap() makes, making these
routines take the dnode_t* rather than an objset_t* and a uint64_t
object number. In particular, the following routines will need to have
analogous *_by_dnode() variants created:
dmu_buf_hold_noread()
dmu_buf_hold()
zap_lookup()
zap_lookup_norm()
zap_count_write()
zap_lockdir()
zap_count_write()
This can improve performance on the benchmark described above by 100%,
from 30,000 file creations per second to 60,000. (This improvement is on
top of that provided by working around the object allocation issue. Peak
performance of ~90,000 creations per second was observed with 8 CPUs;
adding CPUs past that decreased performance due to lock contention.) The
CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds
to 40 CPU-seconds.
Sponsored by: Intel Corp.
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7004
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/109
Closes #4641
Closes #4972
2016-07-21 01:42:13 +03:00
|
|
|
void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
|
2008-11-20 23:01:55 +03:00
|
|
|
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
|
2013-06-11 21:12:34 +04:00
|
|
|
/*
|
|
|
|
* Like dmu_object_info_from_db, but faster still when you only care about
|
2020-02-25 02:38:23 +03:00
|
|
|
* the size.
|
2013-06-11 21:12:34 +04:00
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
|
|
|
|
u_longlong_t *nblk512);
|
|
|
|
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
/*
 * NOTE(review): by analogy with dmu_object_size_from_db(), presumably
 * reports the dnode size of the object behind db through *dnsize; the
 * units (bytes vs. slots) are not visible here -- confirm.
 */
void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
typedef struct dmu_objset_stats {
|
|
|
|
uint64_t dds_num_clones; /* number of clones of this */
|
|
|
|
uint64_t dds_creation_txg;
|
|
|
|
uint64_t dds_guid;
|
|
|
|
dmu_objset_type_t dds_type;
|
|
|
|
uint8_t dds_is_snapshot;
|
|
|
|
uint8_t dds_inconsistent;
|
Implement Redacted Send/Receive
Redacted send/receive allows users to send subsets of their data to
a target system. One possible use case for this feature is to not
transmit sensitive information to a data warehousing, test/dev, or
analytics environment. Another is to save space by not replicating
unimportant data within a given dataset, for example in backup tools
like zrepl.
Redacted send/receive is a three-stage process. First, a clone (or
clones) is made of the snapshot to be sent to the target. In this
clone (or clones), all unnecessary or unwanted data is removed or
modified. This clone is then snapshotted to create the "redaction
snapshot" (or snapshots). Second, the new zfs redact command is used
to create a redaction bookmark. The redaction bookmark stores the
list of blocks in a snapshot that were modified by the redaction
snapshot(s). Finally, the redaction bookmark is passed as a parameter
to zfs send. When sending to the snapshot that was redacted, the
redaction bookmark is used to filter out blocks that contain sensitive
or unwanted information, and those blocks are not included in the send
stream. When sending from the redaction bookmark, the blocks it
contains are considered as candidate blocks in addition to those
blocks in the destination snapshot that were modified since the
creation_txg of the redaction bookmark. This step is necessary to
allow the target to rehydrate data in the case where some blocks are
accidentally or unnecessarily modified in the redaction snapshot.
The changes to bookmarks to enable fast space estimation involve
adding deadlists to bookmarks. There is also logic to manage the
life cycles of these deadlists.
The new size estimation process operates in cases where previously
an accurate estimate could not be provided. In those cases, a send
is performed where no data blocks are read, reducing the runtime
significantly and providing a byte-accurate size estimate.
Reviewed-by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Prashanth Sreenivasa <pks@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Chris Williamson <chris.williamson@delphix.com>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #7958
2019-06-19 19:48:13 +03:00
|
|
|
uint8_t dds_redacted;
|
2016-06-16 00:28:36 +03:00
|
|
|
char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
|
2008-11-20 23:01:55 +03:00
|
|
|
} dmu_objset_stats_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get stats on a dataset.
|
|
|
|
*/
|
|
|
|
void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add entries to the nvlist for all the objset's properties. See
|
|
|
|
* zfs_prop_table[] and zfs(1m) for details on the properties.
|
|
|
|
*/
|
|
|
|
void dmu_objset_stats(objset_t *os, struct nvlist *nv);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the space usage statistics for statvfs().
|
|
|
|
*
|
|
|
|
* refdbytes is the amount of space "referenced" by this objset.
|
|
|
|
* availbytes is the amount of space available to this objset, taking
|
|
|
|
* into account quotas & reservations, assuming that no other objsets
|
|
|
|
* use the space first. These values correspond to the 'referenced' and
|
|
|
|
* 'available' properties, described in the zfs(1m) manpage.
|
|
|
|
*
|
|
|
|
* usedobjs and availobjs are the number of objects currently allocated,
|
|
|
|
* and available.
|
|
|
|
*/
|
|
|
|
void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
|
|
|
|
uint64_t *usedobjsp, uint64_t *availobjsp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The fsid_guid is a 56-bit ID that can change to avoid collisions.
|
|
|
|
* (Contrast with the ds_guid which is a 64-bit ID that will never
|
|
|
|
* change, so there is a small probability that it will collide.)
|
|
|
|
*/
|
|
|
|
uint64_t dmu_objset_fsid_guid(objset_t *os);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Get the [cm]time for an objset's snapshot dir
|
|
|
|
*/
|
2018-06-20 07:51:18 +03:00
|
|
|
inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
int dmu_objset_is_snapshot(objset_t *os);
|
|
|
|
|
|
|
|
extern struct spa *dmu_objset_spa(objset_t *os);
|
|
|
|
extern struct zilog *dmu_objset_zil(objset_t *os);
|
|
|
|
extern struct dsl_pool *dmu_objset_pool(objset_t *os);
|
|
|
|
extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
|
|
|
|
extern void dmu_objset_name(objset_t *os, char *buf);
|
|
|
|
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
|
|
|
|
extern uint64_t dmu_objset_id(objset_t *os);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
extern uint64_t dmu_objset_dnodesize(objset_t *os);
|
2014-05-23 20:21:07 +04:00
|
|
|
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
|
|
|
|
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
|
2019-07-26 20:54:14 +03:00
|
|
|
extern int dmu_objset_blksize(objset_t *os);
|
2008-11-20 23:01:55 +03:00
|
|
|
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
|
|
|
|
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
|
2013-01-26 02:57:53 +04:00
|
|
|
extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
|
2020-10-03 03:44:10 +03:00
|
|
|
extern int dmu_snapshot_realname(objset_t *os, const char *name, char *real,
|
2008-11-20 23:01:55 +03:00
|
|
|
int maxlen, boolean_t *conflict);
|
|
|
|
extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
|
|
|
|
uint64_t *idp, uint64_t *offp);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
File incorrectly zeroed when receiving incremental stream that toggles -L
Background:
By increasing the recordsize property above the default of 128KB, a
filesystem may have "large" blocks. By default, a send stream of such a
filesystem does not contain large WRITE records, instead it decreases
objects' block sizes to 128KB and splits the large blocks into 128KB
blocks, allowing the large-block filesystem to be received by a system
that does not support the `large_blocks` feature. A send stream
generated by `zfs send -L` (or `--large-block`) preserves the large
block size on the receiving system, by using large WRITE records.
When receiving an incremental send stream for a filesystem with large
blocks, if the send stream's -L flag was toggled, a bug is encountered
in which the file's contents are incorrectly zeroed out. The contents
of any blocks that were not modified by this send stream will be lost.
"Toggled" means that the previous send used `-L`, but this incremental
does not use `-L` (-L to no-L); or that the previous send did not use
`-L`, but this incremental does use `-L` (no-L to -L).
Changes:
This commit addresses the problem with several changes to the semantics
of zfs send/receive:
1. "-L to no-L" incrementals are rejected. If the previous send used
`-L`, but this incremental does not use `-L`, the `zfs receive` will
fail with this error message:
incremental send stream requires -L (--large-block), to match
previous receive.
2. "no-L to -L" incrementals are handled correctly, preserving the
smaller (128KB) block size of any already-received files that used large
blocks on the sending system but were split by `zfs send` without the
`-L` flag.
3. A new send stream format flag is added, `SWITCH_TO_LARGE_BLOCKS`.
This feature indicates that we can correctly handle "no-L to -L"
incrementals. This flag is currently not set on any send streams. In
the future, we intend for incremental send streams of snapshots that
have large blocks to use `-L` by default, and these streams will also
have the `SWITCH_TO_LARGE_BLOCKS` feature set. This ensures that streams
from the default use of `zfs send` won't encounter the bug mentioned
above, because they can't be received by software with the bug.
Implementation notes:
To facilitate accessing the ZPL's generation number,
`zfs_space_delta_cb()` has been renamed to `zpl_get_file_info()` and
restructured to fill in a struct with ZPL-specific info including owner
and generation.
In the "no-L to -L" case, if this is a compressed send stream (from
`zfs send -cL`), large WRITE records that are being written to small
(128KB) blocksize files need to be decompressed so that they can be
written split up into multiple blocks. The zio pipeline will recompress
each smaller block individually.
A new test case, `send-L_toggle`, is added, which tests the "no-L to -L"
case and verifies that we get an error for the "-L to no-L" case.
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #6224
Closes #10383
2020-06-09 20:41:01 +03:00
|
|
|
/*
 * Per-file information handed to a file_info_cb_t callback through its
 * zfs_file_info output pointer.
 * NOTE(review): the individual field meanings (user/group/project IDs and
 * a generation number) are inferred from the field names -- confirm
 * against the callback implementations.
 */
typedef struct zfs_file_info {
|
|
|
|
uint64_t zfi_user;
|
|
|
|
uint64_t zfi_group;
|
|
|
|
uint64_t zfi_project;
|
|
|
|
uint64_t zfi_generation;
|
|
|
|
} zfs_file_info_t;
|
|
|
|
|
|
|
|
/*
 * Callback type accepted by dmu_objset_register_type().
 *
 * bonustype - a dmu_object_type_t (presumably the type of the object's
 *             bonus buffer -- confirm).
 * data      - read-only pointer to the data the callback inspects.
 * zoi       - zfs_file_info passed by non-const pointer, i.e. available
 *             for the callback to write to.
 *
 * NOTE(review): the return-value convention (0 on success?) is not
 * visible here -- confirm against the implementations.
 */
typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data,
|
|
|
|
struct zfs_file_info *zoi);
|
2009-07-03 02:44:48 +04:00
|
|
|
extern void dmu_objset_register_type(dmu_objset_type_t ost,
|
File incorrectly zeroed when receiving incremental stream that toggles -L
Background:
By increasing the recordsize property above the default of 128KB, a
filesystem may have "large" blocks. By default, a send stream of such a
filesystem does not contain large WRITE records, instead it decreases
objects' block sizes to 128KB and splits the large blocks into 128KB
blocks, allowing the large-block filesystem to be received by a system
that does not support the `large_blocks` feature. A send stream
generated by `zfs send -L` (or `--large-block`) preserves the large
block size on the receiving system, by using large WRITE records.
When receiving an incremental send stream for a filesystem with large
blocks, if the send stream's -L flag was toggled, a bug is encountered
in which the file's contents are incorrectly zeroed out. The contents
of any blocks that were not modified by this send stream will be lost.
"Toggled" means that the previous send used `-L`, but this incremental
does not use `-L` (-L to no-L); or that the previous send did not use
`-L`, but this incremental does use `-L` (no-L to -L).
Changes:
This commit addresses the problem with several changes to the semantics
of zfs send/receive:
1. "-L to no-L" incrementals are rejected. If the previous send used
`-L`, but this incremental does not use `-L`, the `zfs receive` will
fail with this error message:
incremental send stream requires -L (--large-block), to match
previous receive.
2. "no-L to -L" incrementals are handled correctly, preserving the
smaller (128KB) block size of any already-received files that used large
blocks on the sending system but were split by `zfs send` without the
`-L` flag.
3. A new send stream format flag is added, `SWITCH_TO_LARGE_BLOCKS`.
This feature indicates that we can correctly handle "no-L to -L"
incrementals. This flag is currently not set on any send streams. In
the future, we intend for incremental send streams of snapshots that
have large blocks to use `-L` by default, and these streams will also
have the `SWITCH_TO_LARGE_BLOCKS` feature set. This ensures that streams
from the default use of `zfs send` won't encounter the bug mentioned
above, because they can't be received by software with the bug.
Implementation notes:
To facilitate accessing the ZPL's generation number,
`zfs_space_delta_cb()` has been renamed to `zpl_get_file_info()` and
restructured to fill in a struct with ZPL-specific info including owner
and generation.
In the "no-L to -L" case, if this is a compressed send stream (from
`zfs send -cL`), large WRITE records that are being written to small
(128KB) blocksize files need to be decompressed so that they can be
written split up into multiple blocks. The zio pipeline will recompress
each smaller block individually.
A new test case, `send-L_toggle`, is added, which tests the "no-L to -L"
case and verifies that we get an error for the "-L to no-L" case.
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #6224
Closes #10383
2020-06-09 20:41:01 +03:00
|
|
|
file_info_cb_t *cb);
|
2008-11-20 23:01:55 +03:00
|
|
|
extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
|
|
|
|
extern void *dmu_objset_get_user(objset_t *os);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the txg number for the given assigned transaction.
|
|
|
|
*/
|
|
|
|
uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Synchronous write.
|
|
|
|
* If a parent zio is provided this function initiates a write on the
|
|
|
|
* provided buffer as a child of the parent zio.
|
|
|
|
* In the absence of a parent zio, the write is completed synchronously.
|
|
|
|
* At write completion, blk is filled with the bp of the written block.
|
|
|
|
* Note that while the data covered by this function will be on stable
|
|
|
|
* storage when the write completes this new data does not become a
|
|
|
|
* permanent part of the file until the associated transaction commits.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* {zfs,zvol,ztest}_get_done() args
|
|
|
|
*/
|
|
|
|
typedef struct zgd {
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently outstanding ZIL
blocks to complete. The blocks written for each "writer thread" are
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transactions in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularities.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new algorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
outstanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs from that in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
struct lwb *zgd_lwb;
|
2010-05-29 00:45:14 +04:00
|
|
|
struct blkptr *zgd_bp;
|
|
|
|
dmu_buf_t *zgd_db;
|
2019-11-01 20:37:33 +03:00
|
|
|
struct zfs_locked_range *zgd_lr;
|
2010-05-29 00:45:14 +04:00
|
|
|
void *zgd_private;
|
|
|
|
} zgd_t;
|
|
|
|
|
|
|
|
typedef void dmu_sync_cb_t(zgd_t *arg, int error);
|
|
|
|
int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the next hole or data block in file starting at *off
|
|
|
|
* Return found offset in *off. Return ESRCH for end of file.
|
|
|
|
*/
|
|
|
|
int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
|
|
|
|
uint64_t *off);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initial setup and final teardown.
|
|
|
|
*/
|
|
|
|
extern void dmu_init(void);
|
|
|
|
extern void dmu_fini(void);
|
|
|
|
|
|
|
|
typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
|
|
|
|
uint64_t object, uint64_t offset, int len);
|
|
|
|
void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
|
|
|
|
dmu_traverse_cb_t cb, void *arg);
|
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
|
2019-11-21 20:32:57 +03:00
|
|
|
zfs_file_t *fp, offset_t *offp);
|
2010-08-27 01:24:34 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/* CRC64 table */
|
|
|
|
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
|
|
|
|
extern uint64_t zfs_crc64_table[256];
|
|
|
|
|
Improve log spacemap load time
The previous flushing algorithm limited only the total number of log blocks
to the minimum of 256K and 4x the number of metaslabs in the pool. As a
result, a system with 1500 disks with 1000 metaslabs each, touching several
new metaslabs each TXG, could grow the spacemap log to a huge size without
much benefit. We've observed one such system taking about 45 minutes to
import its pool.
This patch improves the situation from five sides:
- By limiting maximum period for each metaslab to be flushed to 1000
TXGs, that effectively limits maximum number of per-TXG spacemap logs
to load to the same number.
- By making flushing more smooth via accounting number of metaslabs
that were touched after the last flush and actually need another flush,
not just ms_unflushed_txg bump.
- By applying zfs_unflushed_log_block_pct to the number of metaslabs
that were touched after the last flush, not all metaslabs in the pool.
- By aggressively prefetching per-TXG spacemap logs up to 16 TXGs in
advance, making log spacemap load process for wide HDD pool CPU-bound,
accelerating it by many times.
- By reducing zfs_unflushed_log_block_max from 256K to 128K, reducing
single-threaded by nature log processing time from ~10 to ~5 minutes.
As a further optimization we could skip bumping ms_unflushed_txg for
metaslabs not touched since the last flush, but that would be an
incompatible change, requiring a new pool feature.
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12789
2022-04-26 20:44:21 +03:00
|
|
|
extern int dmu_prefetch_max;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _SYS_DMU_H */
|