Native Encryption for ZFS on Linux

This change incorporates three major pieces:

The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.

The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.

The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494 
Closes #5769
This commit is contained in:
Tom Caputi
2017-08-14 13:36:48 -04:00
committed by Brian Behlendorf
parent 376994828f
commit b525630342
163 changed files with 16091 additions and 1204 deletions
+6 -6
View File
@@ -52,7 +52,7 @@
static void Encode(uint8_t *, uint32_t *, size_t);
static void Encode64(uint8_t *, uint64_t *, size_t);
#if defined(__amd64) && defined(_KERNEL)
#if defined(__amd64)
#define SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1)
#define SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1)
@@ -62,7 +62,7 @@ void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
#else
static void SHA256Transform(SHA2_CTX *, const uint8_t *);
static void SHA512Transform(SHA2_CTX *, const uint8_t *);
#endif /* __amd64 && _KERNEL */
#endif /* __amd64 */
static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
@@ -142,7 +142,7 @@ static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
#endif /* _BIG_ENDIAN */
#if !defined(__amd64) || !defined(_KERNEL)
#if !defined(__amd64)
/* SHA256 Transform */
static void
@@ -600,7 +600,7 @@ SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk)
ctx->state.s64[7] += h;
}
#endif /* !__amd64 || !_KERNEL */
#endif /* !__amd64 */
/*
@@ -838,7 +838,7 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
i = buf_len;
}
#if !defined(__amd64) || !defined(_KERNEL)
#if !defined(__amd64)
if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
for (; i + buf_limit - 1 < input_len; i += buf_limit) {
SHA256Transform(ctx, &input[i]);
@@ -866,7 +866,7 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
i += block_count << 7;
}
}
#endif /* !__amd64 || !_KERNEL */
#endif /* !__amd64 */
/*
* general optimization:
+2 -2
View File
@@ -61,7 +61,7 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd,
offset -= uiop->uio_iov[vec_idx++].iov_len)
;
if (vec_idx == uiop->uio_iovcnt) {
if (vec_idx == uiop->uio_iovcnt && length > 0) {
/*
* The caller specified an offset that is larger than
* the total size of the buffers it provided.
@@ -192,7 +192,7 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output,
offset >= uiop->uio_iov[vec_idx].iov_len;
offset -= uiop->uio_iov[vec_idx++].iov_len)
;
if (vec_idx == uiop->uio_iovcnt) {
if (vec_idx == uiop->uio_iovcnt && length > 0) {
/*
* The caller specified an offset that is larger than the
* total size of the buffers it provided.
+1 -1
View File
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
* Copyright (c) 2016, Datto, Inc. All rights reserved.
* Copyright (c) 2017, Datto, Inc. All rights reserved.
*/
#ifdef _KERNEL
+11
View File
@@ -318,6 +318,17 @@ zpool_feature_init(void)
ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
userobj_accounting_deps);
}
{
static const spa_feature_t encryption_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_ENCRYPTION,
"com.datto:encryption", "encryption",
"Support for dataset level encryption",
ZFEATURE_FLAG_PER_DATASET, encryption_deps);
}
}
#if defined(_KERNEL) && defined(HAVE_SPL)
+2
View File
@@ -69,6 +69,8 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
{ZFS_DELEG_PERM_GROUPOBJUSED},
{ZFS_DELEG_PERM_HOLD},
{ZFS_DELEG_PERM_RELEASE},
{ZFS_DELEG_PERM_LOAD_KEY},
{ZFS_DELEG_PERM_CHANGE_KEY},
{NULL}
};
+96 -3
View File
@@ -33,6 +33,7 @@
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include <sys/dsl_crypt.h>
#include "zfs_prop.h"
#include "zfs_deleg.h"
@@ -119,6 +120,26 @@ zfs_prop_init(void)
{ NULL }
};
static zprop_index_t crypto_table[] = {
{ "on", ZIO_CRYPT_ON },
{ "off", ZIO_CRYPT_OFF },
{ "aes-128-ccm", ZIO_CRYPT_AES_128_CCM },
{ "aes-192-ccm", ZIO_CRYPT_AES_192_CCM },
{ "aes-256-ccm", ZIO_CRYPT_AES_256_CCM },
{ "aes-128-gcm", ZIO_CRYPT_AES_128_GCM },
{ "aes-192-gcm", ZIO_CRYPT_AES_192_GCM },
{ "aes-256-gcm", ZIO_CRYPT_AES_256_GCM },
{ NULL }
};
static zprop_index_t keyformat_table[] = {
{ "none", ZFS_KEYFORMAT_NONE },
{ "raw", ZFS_KEYFORMAT_RAW },
{ "hex", ZFS_KEYFORMAT_HEX },
{ "passphrase", ZFS_KEYFORMAT_PASSPHRASE },
{ NULL }
};
static zprop_index_t snapdir_table[] = {
{ "hidden", ZFS_SNAPDIR_HIDDEN },
{ "visible", ZFS_SNAPDIR_VISIBLE },
@@ -193,6 +214,13 @@ zfs_prop_init(void)
{ NULL }
};
static zprop_index_t keystatus_table[] = {
{ "none", ZFS_KEYSTATUS_NONE},
{ "unavailable", ZFS_KEYSTATUS_UNAVAILABLE},
{ "available", ZFS_KEYSTATUS_AVAILABLE},
{ NULL }
};
static zprop_index_t logbias_table[] = {
{ "latency", ZFS_LOGBIAS_LATENCY },
{ "throughput", ZFS_LOGBIAS_THROUGHPUT },
@@ -351,12 +379,16 @@ zfs_prop_init(void)
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
"CANMOUNT", canmount_table);
/* readonly index (boolean) properties */
/* readonly index properties */
zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
boolean_table);
zprop_register_index(ZFS_PROP_KEYSTATUS, "keystatus",
ZFS_KEYSTATUS_NONE, PROP_READONLY, ZFS_TYPE_DATASET,
"none | unavailable | available",
"KEYSTATUS", keystatus_table);
/* set once index properties */
zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
@@ -367,6 +399,15 @@ zfs_prop_init(void)
ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_SNAPSHOT,
"sensitive | insensitive | mixed", "CASE", case_table);
zprop_register_index(ZFS_PROP_KEYFORMAT, "keyformat",
ZFS_KEYFORMAT_NONE, PROP_ONETIME_DEFAULT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"none | raw | hex | passphrase", "KEYFORMAT", keyformat_table);
zprop_register_index(ZFS_PROP_ENCRYPTION, "encryption",
ZIO_CRYPT_DEFAULT, PROP_ONETIME, ZFS_TYPE_DATASET,
"on | off | aes-128-ccm | aes-192-ccm | aes-256-ccm | "
"aes-128-gcm | aes-192-gcm | aes-256-gcm", "ENCRYPTION",
crypto_table);
/* set once index (boolean) properties */
zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
@@ -409,6 +450,12 @@ zfs_prop_init(void)
"receive_resume_token",
NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<string token>", "RESUMETOK");
zprop_register_string(ZFS_PROP_ENCRYPTION_ROOT, "encryptionroot", NULL,
PROP_READONLY, ZFS_TYPE_DATASET, "<filesystem | volume>",
"ENCROOT");
zprop_register_string(ZFS_PROP_KEYLOCATION, "keylocation",
"none", PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"prompt | <file URI>", "KEYLOCATION");
/* readonly number properties */
zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
@@ -456,6 +503,9 @@ zfs_prop_init(void)
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "GUID");
zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY,
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "CREATETXG");
zprop_register_number(ZFS_PROP_PBKDF2_ITERS, "pbkdf2iters",
0, PROP_ONETIME_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<iters>", "PBKDF2ITERS");
/* default number properties */
zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
@@ -503,6 +553,11 @@ zfs_prop_init(void)
PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");
zprop_register_hidden(ZFS_PROP_PBKDF2_SALT, "pbkdf2salt",
PROP_TYPE_NUMBER, PROP_ONETIME_DEFAULT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PBKDF2SALT");
zprop_register_hidden(ZFS_PROP_KEY_GUID, "keyguid", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_DATASET, "KEYGUID");
/*
* Property to be removed once libbe is integrated
@@ -650,7 +705,8 @@ boolean_t
zfs_prop_readonly(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
zfs_prop_table[prop].pd_attr == PROP_ONETIME);
zfs_prop_table[prop].pd_attr == PROP_ONETIME ||
zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT);
}
/*
@@ -659,7 +715,8 @@ zfs_prop_readonly(zfs_prop_t prop)
boolean_t
zfs_prop_setonce(zfs_prop_t prop)
{
return (zfs_prop_table[prop].pd_attr == PROP_ONETIME);
return (zfs_prop_table[prop].pd_attr == PROP_ONETIME ||
zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT);
}
const char *
@@ -694,6 +751,40 @@ zfs_prop_inheritable(zfs_prop_t prop)
zfs_prop_table[prop].pd_attr == PROP_ONETIME);
}
/*
 * Reports whether the given property is one of the encryption key
 * parameters (pbkdf2salt, pbkdf2iters or keyformat), i.e. a property that
 * requires a loaded encryption key to modify.
 *
 * Returns B_TRUE for those three properties and B_FALSE for anything else.
 */
boolean_t
zfs_prop_encryption_key_param(zfs_prop_t prop)
{
	/*
	 * keylocation is intentionally not listed here: it does not count
	 * as an encryption property and can be changed at will without
	 * needing the master keys.
	 */
	switch (prop) {
	case ZFS_PROP_PBKDF2_SALT:
	case ZFS_PROP_PBKDF2_ITERS:
	case ZFS_PROP_KEYFORMAT:
		return (B_TRUE);
	default:
		return (B_FALSE);
	}
}
/*
 * Validates a keylocation property string. Shared by kernelspace and
 * userspace.
 *
 *   "none"          valid only when the dataset is NOT encrypted
 *   "prompt"        always valid
 *   "file:///..."   valid as long as something follows the prefix
 *
 * Any other string is rejected. When `encrypted` is set, the keylocation
 * must therefore be one that is usable for an encrypted dataset.
 */
boolean_t
zfs_prop_valid_keylocation(const char *str, boolean_t encrypted)
{
	static const char file_prefix[] = "file:///";
	const size_t prefix_len = sizeof (file_prefix) - 1;

	if (strcmp(str, "none") == 0)
		return (!encrypted);

	if (strcmp(str, "prompt") == 0)
		return (B_TRUE);

	/* A bare "file:///" with no path after it is not acceptable. */
	if (strlen(str) > prefix_len &&
	    strncmp(str, file_prefix, prefix_len) == 0)
		return (B_TRUE);

	return (B_FALSE);
}
#ifndef _KERNEL
/*
@@ -774,6 +865,8 @@ EXPORT_SYMBOL(zfs_prop_default_string);
EXPORT_SYMBOL(zfs_prop_default_numeric);
EXPORT_SYMBOL(zfs_prop_readonly);
EXPORT_SYMBOL(zfs_prop_inheritable);
EXPORT_SYMBOL(zfs_prop_encryption_key_param);
EXPORT_SYMBOL(zfs_prop_valid_keylocation);
EXPORT_SYMBOL(zfs_prop_setonce);
EXPORT_SYMBOL(zfs_prop_to_name);
EXPORT_SYMBOL(zfs_name_to_prop);
+2
View File
@@ -33,6 +33,7 @@ $(MODULE)-objs += dsl_deadlist.o
$(MODULE)-objs += dsl_deleg.o
$(MODULE)-objs += dsl_bookmark.o
$(MODULE)-objs += dsl_dir.o
$(MODULE)-objs += dsl_crypt.o
$(MODULE)-objs += dsl_pool.o
$(MODULE)-objs += dsl_prop.o
$(MODULE)-objs += dsl_scan.o
@@ -103,6 +104,7 @@ $(MODULE)-objs += zil.o
$(MODULE)-objs += zio.o
$(MODULE)-objs += zio_checksum.o
$(MODULE)-objs += zio_compress.o
$(MODULE)-objs += zio_crypt.o
$(MODULE)-objs += zio_inject.o
$(MODULE)-objs += zle.o
$(MODULE)-objs += zpl_ctldir.o
+1352 -259
View File
File diff suppressed because it is too large Load Diff
+2 -1
View File
@@ -212,7 +212,8 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
err = 0;
for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
bptree_entry_phys_t bte;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST |
TRAVERSE_NO_DECRYPT;
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH);
+173 -36
View File
@@ -964,7 +964,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
}
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
@@ -984,7 +984,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else if (zio == NULL || zio->io_error == 0) {
} else if (err == 0) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
@@ -1003,7 +1003,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
dnode_t *dn;
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err;
int err, zio_flags = 0;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -1021,6 +1021,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
arc_buf_t *dn_buf = (dn->dn_dbuf != NULL) ?
dn->dn_dbuf->db_buf : NULL;
/* if the underlying dnode block is encrypted, decrypt it */
if (dn_buf != NULL && dn->dn_objset->os_encrypted &&
DMU_OT_IS_ENCRYPTED(dn->dn_bonustype) &&
(flags & DB_RF_NO_DECRYPT) == 0 &&
arc_is_encrypted(dn_buf)) {
err = arc_untransform(dn_buf, dn->dn_objset->os_spa,
dmu_objset_id(dn->dn_objset), B_TRUE);
if (err != 0) {
DB_DNODE_EXIT(db);
mutex_exit(&db->db_mtx);
return (err);
}
}
ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
@@ -1088,11 +1104,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
db->db.db_object, db->db_level, db->db_blkid);
/*
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
spa_log_error(db->db_objset->os_spa, &zb);
zfs_panic_recover("unencrypted block in encrypted "
"object set %llu", dmu_objset_id(db->db_objset));
return (SET_ERROR(EIO));
}
dbuf_add_ref(db, NULL);
zio_flags = (flags & DB_RF_CANFAIL) ?
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
zio_flags |= ZIO_FLAG_RAW;
err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb);
return (err);
@@ -1141,18 +1173,31 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
dnode_t *dn = DB_DNODE(db);
int size = arc_buf_size(db->db_buf);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa = db->db_objset->os_spa;
enum zio_compress compress_type =
arc_get_compression(db->db_buf);
if (compress_type == ZIO_COMPRESS_OFF) {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
} else {
if (arc_is_encrypted(db->db_buf)) {
boolean_t byteorder;
uint8_t salt[ZIO_DATA_SALT_LEN];
uint8_t iv[ZIO_DATA_IV_LEN];
uint8_t mac[ZIO_DATA_MAC_LEN];
arc_get_raw_params(db->db_buf, &byteorder, salt,
iv, mac);
dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
compress_type);
} else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
size, arc_buf_lsize(db->db_buf), compress_type);
} else {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
@@ -1188,16 +1233,21 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
spa_t *spa = dn->dn_objset->os_spa;
/*
* If the arc buf is compressed, we need to decompress it to
* read the data. This could happen during the "zfs receive" of
* a stream which is compressed and deduplicated.
* If the arc buf is compressed or encrypted, we need to
* untransform it to read the data. This could happen during
* the "zfs receive" of a stream which is deduplicated and
* either raw or compressed. We do not need to do this if the
* caller wants raw encrypted data.
*/
if (db->db_buf != NULL &&
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
dbuf_fix_old_data(db,
spa_syncing_txg(dmu_objset_spa(db->db_objset)));
err = arc_decompress(db->db_buf);
if (db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 &&
(arc_is_encrypted(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
dbuf_fix_old_data(db, spa_syncing_txg(spa));
err = arc_untransform(db->db_buf, spa,
dmu_objset_id(db->db_objset), B_FALSE);
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
@@ -1316,6 +1366,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
dr->dt.dl.dr_raw = B_FALSE;
/*
* Release the already-written buffer, so we leave it in
@@ -1908,11 +1959,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
}
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
static void
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
dbuf_dirty_record_t *dr;
ASSERT(tx->tx_txg != 0);
@@ -1944,12 +1994,19 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
DB_DNODE_ENTER(db);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
flags |= DB_RF_HAVESTRUCT;
DB_DNODE_EXIT(db);
(void) dbuf_read(db, NULL, rf);
(void) dbuf_read(db, NULL, flags);
(void) dbuf_dirty(db, tx);
}
/*
 * Dirty the given buffer in the given transaction using the default read
 * flags. This is a thin wrapper that forwards to dmu_buf_will_dirty_impl()
 * with DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH.
 */
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	dmu_buf_will_dirty_impl(db_fake, rf, tx);
}
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
@@ -1977,6 +2034,29 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
(void) dbuf_dirty(db, tx);
}
/*
 * Variant of dmu_buf_will_dirty() for callers that expect raw encrypted
 * data in the db. The buffer is dirtied with DB_RF_NO_DECRYPT added to the
 * usual read flags, and the dirty record belonging to this transaction's
 * txg is then flagged as raw.
 */
void
dmu_buf_will_change_crypt_params(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dbuf_dirty_record_t *dr;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT;

	dmu_buf_will_dirty_impl(db_fake, rf, tx);

	/*
	 * Walk the dirty record list, stepping past any record whose txg
	 * is newer than this transaction's txg.
	 */
	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) {
		if (dr->dr_txg <= tx->tx_txg)
			break;
	}

	/* The record for this txg must exist, since we just dirtied it. */
	ASSERT3P(dr, !=, NULL);
	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
	dr->dt.dl.dr_raw = B_TRUE;
}
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
@@ -2117,10 +2197,11 @@ dbuf_destroy(dmu_buf_impl_t *db)
if (db->db_blkid == DMU_BONUS_BLKID) {
int slots = DB_DNODE(db)->dn_num_slots;
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
ASSERT(db->db.db_data != NULL);
kmem_free(db->db.db_data, bonuslen);
arc_space_return(bonuslen, ARC_SPACE_BONUS);
db->db_state = DB_UNCACHED;
if (db->db.db_data != NULL) {
kmem_free(db->db.db_data, bonuslen);
arc_space_return(bonuslen, ARC_SPACE_BONUS);
db->db_state = DB_UNCACHED;
}
}
dbuf_clear_data(db);
@@ -2416,7 +2497,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
* prefetch if the next block down is our target.
*/
static void
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
uint64_t nextblkid;
@@ -2438,7 +2519,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
*/
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
if (zio->io_flags & ZIO_FLAG_RAW) {
if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
} else {
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
@@ -2463,7 +2544,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
if (BP_IS_HOLE(bp) || err != 0) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
@@ -2491,7 +2572,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
* Issue prefetch reads for the given block on the given level. If the indirect
* blocks above that block are not in memory, we will read them in
* asynchronously. As a result, this call never blocks waiting for a read to
* complete.
* complete. Note that the prefetch might fail if the dataset is encrypted and
* the encryption key is unmapped before the IO completes.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
@@ -3120,6 +3202,41 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
}
}
/*
 * Make sure the dbuf's arc buffer is in the form that the dirty record
 * expects before it is written out. dbuf_sync_leaf() uses this so that a
 * dnode block is decrypted before new data is written to it.
 *
 *  - raw dirty record:     the buffer must already be encrypted (asserted).
 *  - non-raw dirty record: an encrypted buffer is untransformed in place.
 *
 * The caller must hold db->db_mtx.
 */
static void
dbuf_check_crypt(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	int err;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dr->dt.dl.dr_raw) {
		/*
		 * Writing raw encrypted data requires the db's arc buffer
		 * to be converted to raw by the caller.
		 */
		ASSERT(arc_is_encrypted(db->db_buf));
		return;
	}

	/* Nothing to do for a non-raw write of an unencrypted buffer. */
	if (!arc_is_encrypted(db->db_buf))
		return;

	/*
	 * Unfortunately, there is currently no mechanism for syncing
	 * context to handle decryption errors. An error here is only
	 * possible if an attacker maliciously changed a dnode block and
	 * updated the associated checksums going up the block tree.
	 */
	err = arc_untransform(db->db_buf, db->db_objset->os_spa,
	    dmu_objset_id(db->db_objset), B_TRUE);
	if (err != 0)
		panic("Invalid dnode block MAC");
}
/*
* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
* is critical that we not allow the compiler to inline this function in to
@@ -3241,9 +3358,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=,
ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
bcopy(*datap, DN_BONUS(dn->dn_phys),
DN_MAX_BONUS_LEN(dn->dn_phys));
DB_DNODE_EXIT(db);
if (*datap != db->db.db_data) {
@@ -3290,6 +3408,13 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
/*
* If this is a dnode block, ensure it is appropriately encrypted
* or decrypted, depending on what we are writing to it this txg.
*/
if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
dbuf_check_crypt(dr);
if (db->db_state != DB_NOFILL &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
@@ -3307,16 +3432,27 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* DNONE_DNODE blocks).
*/
int psize = arc_buf_size(*datap);
int lsize = arc_buf_lsize(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
enum zio_compress compress_type = arc_get_compression(*datap);
if (compress_type == ZIO_COMPRESS_OFF) {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
} else {
if (arc_is_encrypted(*datap)) {
boolean_t byteorder;
uint8_t salt[ZIO_DATA_SALT_LEN];
uint8_t iv[ZIO_DATA_IV_LEN];
uint8_t mac[ZIO_DATA_MAC_LEN];
arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
*datap = arc_alloc_raw_buf(os->os_spa, db,
dmu_objset_id(os), byteorder, salt, iv, mac,
dn->dn_type, psize, lsize, compress_type);
} else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
int lsize = arc_buf_lsize(*datap);
*datap = arc_alloc_compressed_buf(os->os_spa, db,
psize, lsize, compress_type);
} else {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
bcopy(db->db.db_data, (*datap)->b_data, psize);
}
@@ -3453,7 +3589,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);
if (!BP_IS_EMBEDDED(bp))
bp->blk_fill = fill;
BP_SET_FILL(bp, fill);
mutex_exit(&db->db_mtx);
@@ -3778,7 +3914,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
} else {
arc_done_func_t *children_ready_cb = NULL;
arc_write_done_func_t *children_ready_cb = NULL;
ASSERT(arc_released(data));
/*
@@ -3810,6 +3946,7 @@ EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_will_change_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
+19 -4
View File
@@ -269,6 +269,10 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
}
/*
* The bp created via this function may be used for repairs and scrub, but it
* will be missing the salt / IV required to do a full decrypting read.
*/
void
ddt_bp_create(enum zio_checksum checksum,
const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
@@ -279,11 +283,12 @@ ddt_bp_create(enum zio_checksum checksum,
ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
bp->blk_cksum = ddk->ddk_cksum;
bp->blk_fill = 1;
BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
BP_SET_FILL(bp, 1);
BP_SET_CHECKSUM(bp, checksum);
BP_SET_TYPE(bp, DMU_OT_DEDUP);
BP_SET_LEVEL(bp, 0);
@@ -297,9 +302,12 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
ddk->ddk_cksum = bp->blk_cksum;
ddk->ddk_prop = 0;
ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
}
void
@@ -389,7 +397,7 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
if (ddp->ddp_phys_birth == 0)
continue;
for (d = 0; d < SPA_DVAS_PER_BP; d++)
for (d = 0; d < DDE_GET_NDVAS(dde); d++)
dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
dds->dds_blocks += 1;
@@ -562,6 +570,7 @@ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
uint64_t ditto = spa->spa_dedup_ditto;
int total_copies = 0;
int desired_copies = 0;
int copies_needed = 0;
int p;
for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
@@ -588,7 +597,13 @@ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
if (total_refcnt >= ditto * ditto)
desired_copies++;
return (MAX(desired_copies, total_copies) - total_copies);
copies_needed = MAX(desired_copies, total_copies) - total_copies;
/* encrypted blocks store their IV in DVA[2] */
if (DDK_GET_CRYPT(&dde->dde_key))
copies_needed = MIN(copies_needed, SPA_DVAS_PER_BP - 1);
return (copies_needed);
}
int
@@ -599,7 +614,7 @@ ddt_ditto_copies_present(ddt_entry_t *dde)
int copies = 0 - DVA_GET_GANG(dva);
int d;
for (d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
for (d = 0; d < DDE_GET_NDVAS(dde); d++, dva++)
if (DVA_IS_VALID(dva))
copies++;
+206 -62
View File
@@ -73,60 +73,60 @@ unsigned long zfs_per_txg_dirty_frees_percent = 30;
int zfs_dmu_offset_next_sync = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
{ DMU_BSWAP_UINT64, TRUE, "object array" },
{ DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
{ DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
{ DMU_BSWAP_UINT64, TRUE, "bpobj" },
{ DMU_BSWAP_UINT64, TRUE, "bpobj header" },
{ DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
{ DMU_BSWAP_UINT64, TRUE, "SPA space map" },
{ DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
{ DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
{ DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
{ DMU_BSWAP_UINT64, TRUE, "DSL directory" },
{ DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
{ DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
{ DMU_BSWAP_ZAP, TRUE, "DSL props" },
{ DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
{ DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
{ DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
{ DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
{ DMU_BSWAP_UINT8, FALSE, "zvol object" },
{ DMU_BSWAP_ZAP, TRUE, "zvol prop" },
{ DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
{ DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
{ DMU_BSWAP_ZAP, TRUE, "other ZAP" },
{ DMU_BSWAP_ZAP, TRUE, "persistent error log" },
{ DMU_BSWAP_UINT8, TRUE, "SPA history" },
{ DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
{ DMU_BSWAP_ZAP, TRUE, "Pool properties" },
{ DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
{ DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
{ DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
{ DMU_BSWAP_UINT8, TRUE, "FUID table" },
{ DMU_BSWAP_UINT64, TRUE, "FUID table size" },
{ DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
{ DMU_BSWAP_ZAP, TRUE, "scan work queue" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
{ DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
{ DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
{ DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
{ DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
{ DMU_BSWAP_UINT8, TRUE, "System attributes" },
{ DMU_BSWAP_ZAP, TRUE, "SA master node" },
{ DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
{ DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
{ DMU_BSWAP_ZAP, TRUE, "scan translations" },
{ DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
{ DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
{ DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
{ DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
{ DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
{ DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "object directory" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "object array" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
{ DMU_BSWAP_UINT64, TRUE, TRUE, "ZIL intent log" },
{ DMU_BSWAP_DNODE, TRUE, TRUE, "DMU dnode" },
{ DMU_BSWAP_OBJSET, TRUE, FALSE, "DMU objset" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "DSL directory" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL directory child map"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset snap map" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL props" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "DSL dataset" },
{ DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
{ DMU_BSWAP_OLDACL, TRUE, TRUE, "ZFS V0 ACL" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "ZFS plain file" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS directory" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS delete queue" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "zvol object" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "other uint8[]" },
{ DMU_BSWAP_UINT64, FALSE, TRUE, "other uint64[]" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "Pool properties" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL permissions" },
{ DMU_BSWAP_ACL, TRUE, TRUE, "ZFS ACL" },
{ DMU_BSWAP_UINT8, TRUE, TRUE, "ZFS SYSACL" },
{ DMU_BSWAP_UINT8, TRUE, TRUE, "FUID table" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset next clones"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group used" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group quota" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "snapshot refcount tags"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
{ DMU_BSWAP_UINT8, TRUE, TRUE, "System attributes" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "SA master node" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr registration" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr layouts" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "deduplicated block" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL deadlist map" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "DSL deadlist map hdr" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dir clones" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
};
const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
@@ -198,6 +198,8 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
if (err == 0) {
@@ -221,6 +223,8 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
if (err == 0) {
@@ -321,11 +325,18 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
* returns ENOENT, EIO, or 0.
*/
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
dmu_buf_t **dbp)
{
dnode_t *dn;
dmu_buf_impl_t *db;
int error;
uint32_t db_flags = DB_RF_MUST_SUCCEED;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
if (flags & DMU_READ_NO_DECRYPT)
db_flags |= DB_RF_NO_DECRYPT;
error = dnode_hold(os, object, FTAG, &dn);
if (error)
@@ -355,12 +366,24 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
dnode_rele(dn, FTAG);
VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
error = dbuf_read(db, NULL, db_flags);
if (error) {
dnode_evict_bonus(dn);
dbuf_rele(db, tag);
*dbp = NULL;
return (error);
}
*dbp = &db->db;
return (0);
}
/*
 * Hold the bonus buffer of object 'obj' with the default read flags:
 * only DMU_READ_NO_PREFETCH is passed to dmu_bonus_hold_impl(), whose
 * return value is handed back unchanged.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t obj, void *tag, dmu_buf_t **dbp)
{
	uint32_t flags = DMU_READ_NO_PREFETCH;

	return (dmu_bonus_hold_impl(os, obj, tag, flags, dbp));
}
/*
* returns ENOENT, EIO, or 0.
*
@@ -601,8 +624,8 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
* indirect blocks prefetched will be those that point to the blocks containing
* the data starting at offset, and continuing to offset + len.
*
* Note that if the indirect blocks above the blocks being prefetched are not in
* cache, they will be asychronously read in.
* Note that if the indirect blocks above the blocks being prefetched are not
* in cache, they will be asynchronously read in.
*/
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
@@ -1462,6 +1485,83 @@ dmu_return_arcbuf(arc_buf_t *buf)
arc_buf_destroy(buf, FTAG);
}
/*
 * Assign the arc buffer 'buf' to the dbuf behind 'handle' in transaction
 * 'tx' by forwarding to dbuf_assign_arcbuf().
 */
void
dmu_assign_arcbuf_impl(dmu_buf_t *handle, arc_buf_t *buf, dmu_tx_t *tx)
{
	dbuf_assign_arcbuf((dmu_buf_impl_t *)handle, buf, tx);
}
/*
 * Convert the arc buffer attached to 'handle' into a raw buffer carrying
 * the supplied byteorder, salt, IV and MAC (see arc_convert_to_raw()).
 *
 * The dbuf must already have an arc buffer attached and must belong to an
 * objset whose id is non-zero; both conditions are asserted.
 * dmu_buf_will_change_crypt_params() is called with 'tx' before the
 * buffer itself is touched.
 */
void
dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, const uint8_t *salt,
const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
{
dmu_object_type_t type;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
uint64_t dsobj = dmu_objset_id(db->db_objset);
ASSERT3P(db->db_buf, !=, NULL);
ASSERT3U(dsobj, !=, 0);
dmu_buf_will_change_crypt_params(handle, tx);
/* read the owning dnode's object type inside a DB_DNODE_ENTER/EXIT pair */
DB_DNODE_ENTER(db);
type = DB_DNODE(db)->dn_type;
DB_DNODE_EXIT(db);
/*
 * This technically violates the assumption the dmu code makes
 * that dnode blocks are only released in syncing context.
 */
/* the return value of arc_release() is deliberately ignored */
(void) arc_release(db->db_buf, db);
arc_convert_to_raw(db->db_buf, dsobj, byteorder, type, salt, iv, mac);
}
/*
 * Copy the contents of the source dbuf 'handle' into the block at 'offset'
 * of 'object' in objset 'os', as part of transaction 'tx'.
 *
 * A new arc buffer matching the source is loaned: when the source buffer
 * is encrypted, a raw buffer with the same byteorder, salt, IV, MAC,
 * logical size and compression is used; otherwise a plain buffer of the
 * same size is used.  The data is copied into the loaned buffer, which is
 * then assigned to the destination dbuf.
 *
 * The destination is held with DMU_READ_NO_DECRYPT and the hold is
 * VERIFY'd to succeed.
 */
void
dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset,
    dmu_buf_t *handle, dmu_tx_t *tx)
{
	dmu_buf_t *dst_handle;
	dmu_buf_impl_t *dstdb;
	dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle;
	dmu_object_type_t type;
	arc_buf_t *abuf;
	uint64_t datalen;
	boolean_t byteorder;
	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];

	ASSERT3P(srcdb->db_buf, !=, NULL);

	/* hold the db that we want to write to */
	VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle,
	    DMU_READ_NO_DECRYPT));
	dstdb = (dmu_buf_impl_t *)dst_handle;
	datalen = arc_buf_size(srcdb->db_buf);

	/*
	 * Read the destination dnode's type once, inside a
	 * DB_DNODE_ENTER/EXIT pair, the same way dmu_convert_to_raw()
	 * accesses DB_DNODE().
	 */
	DB_DNODE_ENTER(dstdb);
	type = DB_DNODE(dstdb)->dn_type;
	DB_DNODE_EXIT(dstdb);

	/* allocate an arc buffer that matches the type of srcdb->db_buf */
	if (arc_is_encrypted(srcdb->db_buf)) {
		arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac);
		abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os),
		    byteorder, salt, iv, mac, type,
		    datalen, arc_buf_lsize(srcdb->db_buf),
		    arc_get_compression(srcdb->db_buf));
	} else {
		/* we won't get a compressed db back from dmu_buf_hold() */
		ASSERT3U(arc_get_compression(srcdb->db_buf),
		    ==, ZIO_COMPRESS_OFF);
		abuf = arc_loan_buf(os->os_spa,
		    DMU_OT_IS_METADATA(type), datalen);
	}

	ASSERT3U(datalen, ==, arc_buf_size(abuf));

	/* copy the data to the new buffer and assign it to the dstdb */
	bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen);
	dbuf_assign_arcbuf(dstdb, abuf, tx);
	dmu_buf_rele(dst_handle, FTAG);
}
/*
* When possible directly assign passed loaned arc buffer to a dbuf.
* If this is not possible copy the contents of passed arc buf via
@@ -1537,7 +1637,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
BP_SET_LSIZE(bp, db->db_size);
} else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
BP_SET_FILL(bp, 1);
}
}
}
@@ -1842,6 +1942,20 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
return (0);
}
/*
 * Set the number of levels of 'object' in 'os' to 'nlevels' within
 * transaction 'tx'.
 *
 * Returns the dnode_hold() error if the dnode cannot be held; otherwise
 * returns whatever dnode_set_nlevels() reports.  The dnode hold is always
 * dropped before returning.
 */
int
dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err = dnode_hold(os, object, FTAG, &dn);

	if (err == 0) {
		err = dnode_set_nlevels(dn, nlevels, tx);
		dnode_rele(dn, FTAG);
	}

	return (err);
}
int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
dmu_tx_t *tx)
@@ -1916,6 +2030,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
boolean_t dedup = B_FALSE;
boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
boolean_t encrypt = B_FALSE;
int copies = os->os_copies;
/*
@@ -2003,16 +2118,44 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
zp->zp_checksum = checksum;
zp->zp_compress = compress;
ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
/*
* All objects in an encrypted objset are protected from modification
* via a MAC. Encrypted objects store their IV and salt in the last DVA
* in the bp, so we cannot use all copies. Encrypted objects are also
* not subject to nopwrite since writing the same data will still
* result in a new ciphertext. Only encrypted blocks can be dedup'd
* to avoid ambiguity in the dedup code since the DDT does not store
* object types.
*/
if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
encrypt = B_TRUE;
if (DMU_OT_IS_ENCRYPTED(type)) {
copies = MIN(copies, SPA_DVAS_PER_BP - 1);
nopwrite = B_FALSE;
} else {
dedup = B_FALSE;
}
if (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)
compress = ZIO_COMPRESS_EMPTY;
}
zp->zp_compress = compress;
zp->zp_checksum = checksum;
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
zp->zp_encrypt = encrypt;
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
}
/*
@@ -2267,6 +2410,7 @@ EXPORT_SYMBOL(dmu_object_info_from_dnode);
EXPORT_SYMBOL(dmu_object_info_from_db);
EXPORT_SYMBOL(dmu_object_size_from_db);
EXPORT_SYMBOL(dmu_object_dnsize_from_db);
EXPORT_SYMBOL(dmu_object_set_nlevels);
EXPORT_SYMBOL(dmu_object_set_blocksize);
EXPORT_SYMBOL(dmu_object_set_checksum);
EXPORT_SYMBOL(dmu_object_set_compress);
+239 -57
View File
@@ -56,6 +56,7 @@
#include <sys/vdev.h>
#include <sys/policy.h>
#include <sys/spa_impl.h>
#include <sys/dmu_send.h>
/*
* Needed to close a window in dnode_move() that allows the objset to be freed
@@ -391,16 +392,23 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (!BP_IS_HOLE(os->os_rootbp)) {
arc_flags_t aflags = ARC_FLAG_WAIT;
zbookmark_phys_t zb;
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_FLAG_L2CACHE;
if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) {
ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
ASSERT(BP_IS_AUTHENTICATED(bp));
zio_flags |= ZIO_FLAG_RAW;
}
dprintf_bp(os->os_rootbp, "reading %s", "");
err = arc_read(NULL, spa, os->os_rootbp,
arc_getbuf_func, &os->os_phys_buf,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (err != 0) {
kmem_free(os, sizeof (objset_t));
/* convert checksum errors into IO errors */
@@ -441,6 +449,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (ds != NULL) {
boolean_t needlock = B_FALSE;
os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);
/*
* Note: it's valid to open the objset if the dataset is
* long-held, in which case the pool_config lock will not
@@ -450,6 +460,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
needlock = B_TRUE;
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
}
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
@@ -517,6 +528,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/* It's the meta-objset. */
os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
os->os_compress = ZIO_COMPRESS_ON;
os->os_encrypted = B_FALSE;
os->os_copies = spa_max_replication(spa);
os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
os->os_dedup_verify = B_FALSE;
@@ -603,16 +615,18 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
* can be held at a time.
*/
int
dmu_objset_hold(const char *name, void *tag, objset_t **osp)
dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
objset_t **osp)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
err = dsl_pool_hold(name, tag, &dp);
if (err != 0)
return (err);
err = dsl_dataset_hold(dp, name, tag, &ds);
err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds);
if (err != 0) {
dsl_pool_rele(dp, tag);
return (err);
@@ -627,23 +641,38 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp)
return (err);
}
/*
 * Hold the objset called 'name' without requesting decryption; this is
 * dmu_objset_hold_flags() with 'decrypt' fixed to B_FALSE.
 */
int
dmu_objset_hold(const char *name, void *tag, objset_t **osp)
{
	boolean_t decrypt = B_FALSE;

	return (dmu_objset_hold_flags(name, decrypt, tag, osp));
}
static int
dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
int err;
err = dmu_objset_from_ds(ds, osp);
if (err != 0) {
dsl_dataset_disown(ds, tag);
return (err);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EINVAL));
} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EROFS));
}
return (err);
/* if we are decrypting, we can now check MACs in os->os_phys_buf */
if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) {
err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa,
ds->ds_object, B_FALSE);
if (err != 0)
return (err);
ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf));
}
return (0);
}
/*
@@ -653,49 +682,71 @@ dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
*/
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
err = dsl_pool_hold(name, FTAG, &dp);
if (err != 0)
return (err);
err = dsl_dataset_own(dp, name, tag, &ds);
err = dsl_dataset_own(dp, name, flags, tag, &ds);
if (err != 0) {
dsl_pool_rele(dp, FTAG);
return (err);
}
err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
if (err != 0) {
dsl_dataset_disown(ds, flags, tag);
dsl_pool_rele(dp, FTAG);
return (err);
}
dsl_pool_rele(dp, FTAG);
if (err == 0 && dmu_objset_userobjspace_upgradable(*osp))
if (dmu_objset_userobjspace_upgradable(*osp))
dmu_objset_userobjspace_upgrade(*osp);
return (err);
return (0);
}
int
dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
{
dsl_dataset_t *ds;
int err;
ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
err = dsl_dataset_own_obj(dp, obj, tag, &ds);
err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
if (err != 0)
return (err);
return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
if (err != 0) {
dsl_dataset_disown(ds, flags, tag);
return (err);
}
return (0);
}
/*
 * Release a hold on 'os': drop the dataset hold (passing
 * DS_HOLD_FLAG_DECRYPT when 'decrypt' is set, no flags otherwise) and then
 * the pool hold, both under 'tag'.
 */
void
dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
{
	ds_hold_flags_t hold_flags = 0;
	dsl_pool_t *pool;

	if (decrypt)
		hold_flags = DS_HOLD_FLAG_DECRYPT;

	/* look the pool up before the dataset hold is dropped */
	pool = dmu_objset_pool(os);

	dsl_dataset_rele_flags(os->os_dsl_dataset, hold_flags, tag);
	dsl_pool_rele(pool, tag);
}
void
dmu_objset_rele(objset_t *os, void *tag)
{
dsl_pool_t *dp = dmu_objset_pool(os);
dsl_dataset_rele(os->os_dsl_dataset, tag);
dsl_pool_rele(dp, tag);
dmu_objset_rele_flags(os, B_FALSE, tag);
}
/*
@@ -710,7 +761,7 @@ dmu_objset_rele(objset_t *os, void *tag)
* same name so that it can be partially torn down and reconstructed.
*/
void
dmu_objset_refresh_ownership(objset_t *os, void *tag)
dmu_objset_refresh_ownership(objset_t *os, boolean_t decrypt, void *tag)
{
dsl_pool_t *dp;
dsl_dataset_t *ds, *newds;
@@ -724,20 +775,22 @@ dmu_objset_refresh_ownership(objset_t *os, void *tag)
dsl_dataset_name(ds, name);
dp = dmu_objset_pool(os);
dsl_pool_config_enter(dp, FTAG);
dmu_objset_disown(os, tag);
VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
dmu_objset_disown(os, decrypt, tag);
VERIFY0(dsl_dataset_own(dp, name,
(decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, &newds));
VERIFY3P(newds, ==, os->os_dsl_dataset);
dsl_pool_config_exit(dp, FTAG);
}
void
dmu_objset_disown(objset_t *os, void *tag)
dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
{
/*
* Stop upgrading thread
*/
dmu_objset_upgrade_stop(os);
dsl_dataset_disown(os->os_dsl_dataset, tag);
dsl_dataset_disown(os->os_dsl_dataset,
(decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag);
}
void
@@ -820,6 +873,8 @@ dmu_objset_evict(objset_t *os)
} else {
mutex_exit(&os->os_lock);
}
}
void
@@ -866,16 +921,20 @@ dmu_objset_snap_cmtime(objset_t *os)
return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}
/* called from dsl for meta-objset */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dmu_objset_type_t type, dmu_tx_t *tx)
dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx)
{
objset_t *os;
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
/*
 * A value of 0 means "not specified": fall back to the default data block
 * size and indirect block shift.  Each parameter is tested on its own so
 * that 'ibs' actually receives its default (testing 'blksz' a second time
 * could never be true once 'blksz' had been defaulted above).
 */
if (blksz == 0)
	blksz = DNODE_BLOCK_SIZE;
if (ibs == 0)
	ibs = DN_MAX_INDBLKSHIFT;
if (ds != NULL)
VERIFY0(dmu_objset_from_ds(ds, &os));
else
@@ -883,8 +942,8 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mdn = DMU_META_DNODE(os);
dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0,
DNODE_MIN_SLOTS, tx);
/*
* We don't want to have to increase the meta-dnode's nlevels
@@ -898,22 +957,25 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* to convergence, so minimizing its dn_nlevels matters.
*/
if (ds != NULL) {
int levels = 1;
if (levels == 0) {
levels = 1;
/*
* Determine the number of levels necessary for the meta-dnode
* to contain DN_MAX_OBJECT dnodes. Note that in order to
* ensure that we do not overflow 64 bits, there has to be
* a nlevels that gives us a number of blocks > DN_MAX_OBJECT
* but < 2^64. Therefore,
* (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
* less than (64 - log2(DN_MAX_OBJECT)) (16).
*/
while ((uint64_t)mdn->dn_nblkptr <<
(mdn->dn_datablkshift - DNODE_SHIFT +
(levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
DN_MAX_OBJECT)
levels++;
/*
* Determine the number of levels necessary for the
* meta-dnode to contain DN_MAX_OBJECT dnodes. Note
* that in order to ensure that we do not overflow
* 64 bits, there has to be a nlevels that gives us a
* number of blocks > DN_MAX_OBJECT but < 2^64.
* Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)
* (10) must be less than (64 - log2(DN_MAX_OBJECT))
* (16).
*/
while ((uint64_t)mdn->dn_nblkptr <<
(mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) *
(mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
DN_MAX_OBJECT)
levels++;
}
mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
mdn->dn_nlevels = levels;
@@ -923,7 +985,13 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ASSERT(type != DMU_OST_ANY);
ASSERT(type < DMU_OST_NUMTYPES);
os->os_phys->os_type = type;
if (dmu_objset_userused_enabled(os)) {
/*
* Enable user accounting if it is enabled and this is not an
* encrypted receive.
*/
if (dmu_objset_userused_enabled(os) &&
(!os->os_encrypted || !dmu_objset_is_receiving(os))) {
os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
if (dmu_objset_userobjused_enabled(os)) {
ds->ds_feature_activation_needed[
@@ -939,6 +1007,14 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
return (os);
}
/*
 * called from dsl for meta-objset
 *
 * Wrapper around dmu_objset_create_impl_dnstats() that passes 0 for the
 * levels, block size and indirect block shift arguments.
 */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, dmu_tx_t *tx)
{
	int levels = 0;
	int blksz = 0;
	int ibs = 0;

	return (dmu_objset_create_impl_dnstats(spa, ds, bp, type,
	    levels, blksz, ibs, tx));
}
typedef struct dmu_objset_create_arg {
const char *doca_name;
cred_t *doca_cred;
@@ -947,6 +1023,7 @@ typedef struct dmu_objset_create_arg {
void *doca_userarg;
dmu_objset_type_t doca_type;
uint64_t doca_flags;
dsl_crypto_params_t *doca_dcp;
} dmu_objset_create_arg_t;
/*ARGSUSED*/
@@ -972,8 +1049,16 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx)
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EEXIST));
}
error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp);
if (error != 0) {
dsl_dir_rele(pdd, FTAG);
return (error);
}
error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
doca->doca_cred);
dsl_dir_rele(pdd, FTAG);
return (error);
@@ -990,13 +1075,15 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
uint64_t obj;
blkptr_t *bp;
objset_t *os;
zio_t *rzio;
VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
doca->doca_cred, tx);
doca->doca_cred, doca->doca_dcp, tx);
VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj,
DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
bp = dsl_dataset_get_blkptr(ds);
os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
@@ -1008,18 +1095,56 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
doca->doca_cred, tx);
}
/*
* The doca_userfunc() will write out some data that needs to be
* encrypted if the dataset is encrypted (specifically the root
* directory). This data must be written out before the encryption
* key mapping is removed by dsl_dataset_rele_flags(). Force the
* I/O to occur immediately by invoking the relevant sections of
* dsl_pool_sync().
*/
if (os->os_encrypted) {
dsl_dataset_t *tmpds = NULL;
boolean_t need_sync_done = B_FALSE;
rzio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
tmpds = txg_list_remove(&dp->dp_dirty_datasets, tx->tx_txg);
if (tmpds != NULL) {
ASSERT3P(ds, ==, tmpds);
dsl_dataset_sync(ds, rzio, tx);
need_sync_done = B_TRUE;
}
VERIFY0(zio_wait(rzio));
dmu_objset_do_userquota_updates(os, tx);
taskq_wait(dp->dp_sync_taskq);
rzio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
tmpds = txg_list_remove(&dp->dp_dirty_datasets, tx->tx_txg);
if (tmpds != NULL) {
ASSERT3P(ds, ==, tmpds);
dmu_buf_rele(ds->ds_dbuf, ds);
dsl_dataset_sync(ds, rzio, tx);
}
VERIFY0(zio_wait(rzio));
if (need_sync_done)
dsl_dataset_sync_done(ds, tx);
}
spa_history_log_internal_ds(ds, "create", tx, "");
zvol_create_minors(dp->dp_spa, doca->doca_name, B_TRUE);
dsl_dataset_rele(ds, FTAG);
dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
dsl_dir_rele(pdd, FTAG);
}
int
dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg)
{
dmu_objset_create_arg_t doca;
dsl_crypto_params_t tmp_dcp = { 0 };
doca.doca_name = name;
doca.doca_cred = CRED();
@@ -1028,9 +1153,19 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
doca.doca_userarg = arg;
doca.doca_type = type;
/*
* Some callers (mostly for testing) do not provide a dcp on their
* own but various code inside the sync task will require it to be
* allocated. Rather than adding NULL checks throughout this code
* or adding dummy dcp's to all of the callers we simply create a
* dummy one here and use that. This zero dcp will have the same
* effect as asking for inheritance of all encryption params.
*/
doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp;
return (dsl_sync_task(name,
dmu_objset_create_check, dmu_objset_create_sync, &doca,
5, ZFS_SPACE_CHECK_NORMAL));
6, ZFS_SPACE_CHECK_NORMAL));
}
typedef struct dmu_objset_clone_arg {
@@ -1070,18 +1205,29 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EDQUOT));
}
dsl_dir_rele(pdd, FTAG);
error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
if (error != 0)
if (error != 0) {
dsl_dir_rele(pdd, FTAG);
return (error);
}
/* You can only clone snapshots, not the head datasets. */
if (!origin->ds_is_snapshot) {
dsl_dataset_rele(origin, FTAG);
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EINVAL));
}
error = dmu_objset_clone_crypt_check(pdd, origin->ds_dir);
if (error != 0) {
dsl_dataset_rele(origin, FTAG);
dsl_dir_rele(pdd, FTAG);
return (error);
}
dsl_dataset_rele(origin, FTAG);
dsl_dir_rele(pdd, FTAG);
return (0);
}
@@ -1101,7 +1247,7 @@ dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
doca->doca_cred, tx);
doca->doca_cred, NULL, tx);
VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
dsl_dataset_name(origin, namebuf);
@@ -1124,7 +1270,7 @@ dmu_objset_clone(const char *clone, const char *origin)
return (dsl_sync_task(clone,
dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
5, ZFS_SPACE_CHECK_NORMAL));
6, ZFS_SPACE_CHECK_NORMAL));
}
int
@@ -1232,6 +1378,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
blkptr_t *bp = zio->io_bp;
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
uint64_t fill = 0;
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
@@ -1243,9 +1390,11 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
* objects that are stored in the objset_phys_t -- the meta
* dnode and user/group accounting objects).
*/
bp->blk_fill = 0;
for (i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
BP_SET_FILL(bp, fill);
if (os->os_dsl_dataset != NULL)
rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
*os->os_rootbp = *bp;
@@ -1334,6 +1483,19 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
dmu_write_policy(os, NULL, 0, 0, &zp);
/*
* If we are either claiming the ZIL or doing a raw receive write out
* the os_phys_buf raw. Neither of these actions will affect the MAC
* at this point.
*/
if (arc_is_unauthenticated(os->os_phys_buf) || os->os_next_write_raw) {
ASSERT(os->os_encrypted);
os->os_next_write_raw = B_FALSE;
arc_convert_to_raw(os->os_phys_buf,
os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
DMU_OT_OBJSET, NULL, NULL, NULL);
}
zio = arc_write(pio, os->os_spa, tx->tx_txg,
blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
&zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
@@ -1357,7 +1519,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
txgoff = tx->tx_txg & TXG_MASK;
if (dmu_objset_userused_enabled(os)) {
if (dmu_objset_userused_enabled(os) &&
(!os->os_encrypted || !dmu_objset_is_receiving(os))) {
/*
* We must create the list here because it uses the
* dn_dirty_link[] of this txg. But it may already
@@ -1637,6 +1800,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
if (!dmu_objset_userused_enabled(os))
return;
/* if this is a raw receive just return and handle accounting later */
if (os->os_encrypted && dmu_objset_is_receiving(os))
return;
/* Allocate the user/groupused objects if necessary. */
if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
VERIFY0(zap_create_claim(os,
@@ -1716,6 +1883,18 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
if (!dmu_objset_userused_enabled(dn->dn_objset))
return;
/*
* Raw receives introduce a problem with user accounting. Raw
* receives cannot update the user accounting info because the
* user ids and the sizes are encrypted. To guarantee that we
* never end up with bad user accounting, we simply disable it
* during raw receives. We also disable this for normal receives
* so that an incremental raw receive may be done on top of an
* existing non-raw receive.
*/
if (os->os_encrypted && dmu_objset_is_receiving(os))
return;
if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
DN_ID_CHKED_SPILL)))
return;
@@ -2493,8 +2672,10 @@ EXPORT_SYMBOL(dmu_objset_ds);
EXPORT_SYMBOL(dmu_objset_type);
EXPORT_SYMBOL(dmu_objset_name);
EXPORT_SYMBOL(dmu_objset_hold);
EXPORT_SYMBOL(dmu_objset_hold_flags);
EXPORT_SYMBOL(dmu_objset_own);
EXPORT_SYMBOL(dmu_objset_rele);
EXPORT_SYMBOL(dmu_objset_rele_flags);
EXPORT_SYMBOL(dmu_objset_disown);
EXPORT_SYMBOL(dmu_objset_from_ds);
EXPORT_SYMBOL(dmu_objset_create);
@@ -2512,6 +2693,7 @@ EXPORT_SYMBOL(dmu_objset_dnodesize);
EXPORT_SYMBOL(dmu_objset_sync);
EXPORT_SYMBOL(dmu_objset_is_dirty);
EXPORT_SYMBOL(dmu_objset_create_impl_dnstats);
EXPORT_SYMBOL(dmu_objset_create_impl);
EXPORT_SYMBOL(dmu_objset_open_impl);
EXPORT_SYMBOL(dmu_objset_evict);
+681 -172
View File
File diff suppressed because it is too large Load Diff
+35 -8
View File
@@ -132,7 +132,7 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh)
zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
claim_txg);
claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
zil_free(zilog);
}
@@ -181,6 +181,7 @@ traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
int zio_flags = ZIO_FLAG_CANFAIL;
if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
return;
@@ -196,8 +197,11 @@ traverse_prefetch_metadata(traverse_data_t *td,
if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
return;
if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;
(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
}
static boolean_t
@@ -294,6 +298,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
zbookmark_phys_t *czb;
ASSERT(!BP_IS_PROTECTED(bp));
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
@@ -324,14 +330,23 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_FLAG_WAIT;
uint32_t zio_flags = ZIO_FLAG_CANFAIL;
int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
dnode_phys_t *child_dnp;
/*
* dnode blocks might have their bonus buffers encrypted, so
* we must be careful to honor TRAVERSE_NO_DECRYPT
*/
if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err != 0)
goto post;
child_dnp = buf->b_data;
for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
@@ -347,11 +362,15 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err != 0)
goto post;
@@ -500,6 +519,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
prefetch_data_t *pfd = arg;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT(pfd->pd_bytes_fetched >= 0);
@@ -518,8 +538,11 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
cv_broadcast(&pfd->pd_cv);
mutex_exit(&pfd->pd_mtx);
if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;
(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
zio_flags, &aflags, zb);
return (0);
}
@@ -599,13 +622,17 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
/* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
uint32_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
arc_buf_t *buf;
err = arc_read(NULL, td->td_spa, rootbp,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb);
if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
BP_IS_PROTECTED(rootbp))
zio_flags |= ZIO_FLAG_RAW;
err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
&buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
if (err != 0)
return (err);
+73 -38
View File
@@ -1246,7 +1246,12 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
rw_exit(&mdn->dn_struct_rwlock);
if (db == NULL)
return (SET_ERROR(EIO));
err = dbuf_read(db, NULL, DB_RF_CANFAIL);
/*
* We do not need to decrypt to read the dnode so it doesn't matter
* if we get the encrypted or decrypted version.
*/
err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
if (err) {
dbuf_rele(db, FTAG);
return (err);
@@ -1550,11 +1555,73 @@ fail:
return (SET_ERROR(ENOTSUP));
}
/*
 * Raise the number of levels of 'dn' to 'new_nlevels' in the txg of 'tx'.
 *
 * The caller must hold dn_struct_rwlock as writer (asserted), and
 * 'new_nlevels' must exceed the value already recorded in dn_next_nlevels
 * for this txg (asserted).  The function updates dn_nlevels and
 * dn_next_nlevels, dirties the block-0 dbuf at level 'old_nlevels', and
 * re-parents the dnode's existing dirty records for this txg under that
 * dbuf's dirty record.
 */
static void
dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
{
/* index of this txg's slot in the per-txg arrays */
uint64_t txgoff = tx->tx_txg & TXG_MASK;
int old_nlevels = dn->dn_nlevels;
dmu_buf_impl_t *db;
list_t *list;
dbuf_dirty_record_t *new, *dr, *dr_next;
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
dn->dn_nlevels = new_nlevels;
ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
dn->dn_next_nlevels[txgoff] = new_nlevels;
/* dirty the left indirects */
db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
ASSERT(db != NULL);
/* 'new' is the dirty record returned for the held dbuf */
new = dbuf_dirty(db, tx);
dbuf_rele(db, FTAG);
/* transfer the dirty records to the new indirect */
/* dn_mtx is taken before the new record's dr_mtx and dropped after it */
mutex_enter(&dn->dn_mtx);
mutex_enter(&new->dt.di.dr_mtx);
list = &dn->dn_dirty_records[txgoff];
/* dr_next is fetched first because dr may be unlinked from the list below */
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
/*
 * Move every record that is not at the new top level
 * (new_nlevels-1) and is not the bonus or spill block.
 */
if (dr->dr_dbuf->db_level != new_nlevels-1 &&
dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
/* anything being moved is expected to sit at the old top level */
ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
dr->dr_parent = new;
}
}
mutex_exit(&new->dt.di.dr_mtx);
mutex_exit(&dn->dn_mtx);
}
/*
 * Set the number of levels of 'dn' to 'nlevels' in transaction 'tx'.
 *
 * Takes dn_struct_rwlock as writer for the duration of the call.
 * Returns 0 if the dnode already has 'nlevels' levels or was grown to
 * 'nlevels'; returns EINVAL if 'nlevels' is smaller than the current
 * number of levels.
 */
int
dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
{
	int ret = 0;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	if (nlevels < dn->dn_nlevels) {
		/* the level count can only be raised here */
		ret = SET_ERROR(EINVAL);
	} else if (nlevels > dn->dn_nlevels) {
		dnode_set_nlevels_impl(dn, nlevels, tx);
	}

	rw_exit(&dn->dn_struct_rwlock);

	return (ret);
}
/* read-holding callers must not rely on the lock being continuously held */
void
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
{
uint64_t txgoff = tx->tx_txg & TXG_MASK;
int epbs, new_nlevels;
uint64_t sz;
@@ -1594,41 +1661,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
if (new_nlevels > dn->dn_nlevels) {
int old_nlevels = dn->dn_nlevels;
dmu_buf_impl_t *db;
list_t *list;
dbuf_dirty_record_t *new, *dr, *dr_next;
dn->dn_nlevels = new_nlevels;
ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
dn->dn_next_nlevels[txgoff] = new_nlevels;
/* dirty the left indirects */
db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
ASSERT(db != NULL);
new = dbuf_dirty(db, tx);
dbuf_rele(db, FTAG);
/* transfer the dirty records to the new indirect */
mutex_enter(&dn->dn_mtx);
mutex_enter(&new->dt.di.dr_mtx);
list = &dn->dn_dirty_records[txgoff];
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
if (dr->dr_dbuf->db_level != new_nlevels-1 &&
dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
dr->dr_parent = new;
}
}
mutex_exit(&new->dt.di.dr_mtx);
mutex_exit(&dn->dn_mtx);
}
if (new_nlevels > dn->dn_nlevels)
dnode_set_nlevels_impl(dn, new_nlevels, tx);
out:
if (have_read)
@@ -1987,7 +2021,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
*/
return (SET_ERROR(ESRCH));
}
error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
error = dbuf_read(db, NULL,
DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT);
if (error) {
dbuf_rele(db, FTAG);
return (error);
+10 -3
View File
@@ -31,6 +31,7 @@
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_send.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/range_tree.h>
@@ -557,6 +558,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
dnode_phys_t *dnp = dn->dn_phys;
int txgoff = tx->tx_txg & TXG_MASK;
list_t *list = &dn->dn_dirty_records[txgoff];
@@ -572,8 +574,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
if (dmu_objset_userused_enabled(dn->dn_objset) &&
!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
/*
* Do user accounting if it is enabled and this is not
* an encrypted receive.
*/
if (dmu_objset_userused_enabled(os) &&
!DMU_OBJECT_IS_SPECIAL(dn->dn_object) &&
(!os->os_encrypted || !dmu_objset_is_receiving(os))) {
mutex_enter(&dn->dn_mtx);
dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
dn->dn_oldflags = dn->dn_phys->dn_flags;
@@ -584,7 +591,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
} else {
/* Once we account for it, we should always account for it. */
/* Once we account for it, we should always account for it */
ASSERT(!(dn->dn_phys->dn_flags &
DNODE_FLAG_USERUSED_ACCOUNTED));
ASSERT(!(dn->dn_phys->dn_flags &
File diff suppressed because it is too large Load Diff
+93 -30
View File
@@ -386,8 +386,8 @@ dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
}
int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dsl_dataset_t **dsp)
dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,
ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
{
objset_t *mos = dp->dp_meta_objset;
dmu_buf_t *dbuf;
@@ -548,11 +548,27 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
*dsp = ds;
if ((flags & DS_HOLD_FLAG_DECRYPT) && ds->ds_dir->dd_crypto_obj != 0) {
err = spa_keystore_create_mapping(dp->dp_spa, ds, ds);
if (err != 0) {
dsl_dataset_rele(ds, tag);
return (SET_ERROR(EACCES));
}
}
return (0);
}
int
dsl_dataset_hold(dsl_pool_t *dp, const char *name,
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dsl_dataset_t **dsp)
{
/*
 * Convenience wrapper: hold the dataset identified by dsobj with no
 * hold flags (flags == 0), i.e. without requesting DS_HOLD_FLAG_DECRYPT.
 * Returns whatever dsl_dataset_hold_obj_flags() returns.
 */
return (dsl_dataset_hold_obj_flags(dp, dsobj, 0, tag, dsp));
}
int
dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
void *tag, dsl_dataset_t **dsp)
{
dsl_dir_t *dd;
@@ -568,7 +584,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
ASSERT(dsl_pool_config_held(dp));
obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
if (obj != 0)
err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds);
else
err = SET_ERROR(ENOENT);
@@ -577,16 +593,18 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
dsl_dataset_t *snap_ds;
if (*snapname++ != '@') {
dsl_dataset_rele(ds, tag);
dsl_dataset_rele_flags(ds, flags, tag);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(ENOENT));
}
dprintf("looking for snapshot '%s'\n", snapname);
err = dsl_dataset_snap_lookup(ds, snapname, &obj);
if (err == 0)
err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
dsl_dataset_rele(ds, tag);
if (err == 0) {
err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag,
&snap_ds);
}
dsl_dataset_rele_flags(ds, flags, tag);
if (err == 0) {
mutex_enter(&snap_ds->ds_lock);
@@ -604,14 +622,21 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
}
int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag,
dsl_dataset_t **dsp)
{
/*
 * Convenience wrapper: hold the dataset called name with no hold flags
 * (flags == 0), i.e. without requesting DS_HOLD_FLAG_DECRYPT.
 * Returns whatever dsl_dataset_hold_flags() returns.
 */
return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp));
}
int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
void *tag, dsl_dataset_t **dsp)
{
int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp);
if (err != 0)
return (err);
if (!dsl_dataset_tryown(*dsp, tag)) {
dsl_dataset_rele(*dsp, tag);
dsl_dataset_rele_flags(*dsp, flags, tag);
*dsp = NULL;
return (SET_ERROR(EBUSY));
}
@@ -619,14 +644,14 @@ dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
}
int
dsl_dataset_own(dsl_pool_t *dp, const char *name,
dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
void *tag, dsl_dataset_t **dsp)
{
int err = dsl_dataset_hold(dp, name, tag, dsp);
int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp);
if (err != 0)
return (err);
if (!dsl_dataset_tryown(*dsp, tag)) {
dsl_dataset_rele(*dsp, tag);
dsl_dataset_rele_flags(*dsp, flags, tag);
return (SET_ERROR(EBUSY));
}
return (0);
@@ -707,13 +732,25 @@ dsl_dataset_namelen(dsl_dataset_t *ds)
}
void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
{
/*
 * Release a hold on ds. If DS_HOLD_FLAG_DECRYPT is set in flags and the
 * dataset's dsl_dir has a crypto object, the keystore mapping for this
 * dataset is removed first; the result of that removal is deliberately
 * ignored. The hold on the dataset's dbuf is then dropped using tag.
 * NOTE(review): flags presumably must match the flags used when the
 * hold was taken -- confirm against callers.
 */
if (ds->ds_dir != NULL && ds->ds_dir->dd_crypto_obj != 0 &&
(flags & DS_HOLD_FLAG_DECRYPT)) {
(void) spa_keystore_remove_mapping(ds->ds_dir->dd_pool->dp_spa,
ds->ds_object, ds);
}
dmu_buf_rele(ds->ds_dbuf, tag);
}
void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
/*
 * Convenience wrapper: release a hold with no hold flags (flags == 0),
 * so no keystore mapping removal is attempted.
 */
dsl_dataset_rele_flags(ds, 0, tag);
}
void
dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
{
ASSERT3P(ds->ds_owner, ==, tag);
ASSERT(ds->ds_dbuf != NULL);
@@ -722,7 +759,7 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
ds->ds_owner = NULL;
mutex_exit(&ds->ds_lock);
dsl_dataset_long_rele(ds, tag);
dsl_dataset_rele(ds, tag);
dsl_dataset_rele_flags(ds, flags, tag);
}
boolean_t
@@ -751,7 +788,7 @@ dsl_dataset_has_owner(dsl_dataset_t *ds)
return (rv);
}
static void
void
dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
@@ -781,7 +818,7 @@ dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx)
dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx)
{
dsl_pool_t *dp = dd->dd_pool;
dmu_buf_t *dbuf;
@@ -881,6 +918,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
}
}
/* handle encryption */
dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx);
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
@@ -903,6 +943,8 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
zio_t *zio;
bzero(&os->os_zil_header, sizeof (os->os_zil_header));
if (os->os_encrypted)
os->os_next_write_raw = B_TRUE;
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
dsl_dataset_sync(ds, zio, tx);
@@ -916,7 +958,8 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
dsl_dataset_t *origin, uint64_t flags, cred_t *cr,
dsl_crypto_params_t *dcp, dmu_tx_t *tx)
{
dsl_pool_t *dp = pdd->dd_pool;
uint64_t dsobj, ddobj;
@@ -928,7 +971,7 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
dsobj = dsl_dataset_create_sync_dd(dd, origin,
dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp,
flags & ~DS_CREATE_FLAG_NODIRTY, tx);
dsl_deleg_set_create_perms(dd, tx, cr);
@@ -1821,6 +1864,10 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
DS_FIELD_RESUME_COMPRESSOK) == 0) {
fnvlist_add_boolean(token_nv, "compressok");
}
if (zap_contains(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_RAWOK) == 0) {
fnvlist_add_boolean(token_nv, "rawok");
}
packed = fnvlist_pack(token_nv, &packed_size);
fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP);
@@ -1851,6 +1898,7 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
int err;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
uint64_t refd, avail, uobjs, aobjs, ratio;
@@ -1901,12 +1949,12 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
ds->ds_userrefs);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
dsl_dataset_crypt_stats(ds, nv);
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
uint64_t written, comp, uncomp;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dsl_dataset_t *prev;
int err;
err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
@@ -2340,7 +2388,7 @@ dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
fnvlist_add_string(ddra->ddra_result, "target", namebuf);
cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx);
VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
@@ -2427,6 +2475,23 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(EXDEV));
}
snap = list_head(&ddpa->shared_snaps);
if (snap == NULL) {
err = SET_ERROR(ENOENT);
goto out;
}
origin_ds = snap->ds;
/*
 * Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
 * When doing a promote we must make sure that the encryption root of
 * both the target and the target's origin does not change, so that we
 * avoid needing to rewrap encryption keys.
 */
err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir);
if (err != 0)
goto out;
/*
* Compute and check the amount of space to transfer. Since this is
* so expensive, don't do the preliminary check.
@@ -2436,13 +2501,6 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
return (0);
}
snap = list_head(&ddpa->shared_snaps);
if (snap == NULL) {
err = SET_ERROR(ENOENT);
goto out;
}
origin_ds = snap->ds;
/* compute origin's new unique space */
snap = list_tail(&ddpa->clone_snaps);
ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
@@ -2611,6 +2669,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
NULL, FTAG, &odd));
dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx);
/* change origin's next snap */
dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
@@ -3692,11 +3752,14 @@ MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
#endif
EXPORT_SYMBOL(dsl_dataset_hold);
EXPORT_SYMBOL(dsl_dataset_hold_flags);
EXPORT_SYMBOL(dsl_dataset_hold_obj);
EXPORT_SYMBOL(dsl_dataset_hold_obj_flags);
EXPORT_SYMBOL(dsl_dataset_own);
EXPORT_SYMBOL(dsl_dataset_own_obj);
EXPORT_SYMBOL(dsl_dataset_name);
EXPORT_SYMBOL(dsl_dataset_rele);
EXPORT_SYMBOL(dsl_dataset_rele_flags);
EXPORT_SYMBOL(dsl_dataset_disown);
EXPORT_SYMBOL(dsl_dataset_tryown);
EXPORT_SYMBOL(dsl_dataset_create_sync);
+10 -4
View File
@@ -598,8 +598,8 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
ka.ds = ds;
ka.tx = tx;
VERIFY0(traverse_dataset(ds,
dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
kill_blkptr, &ka));
dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST |
TRAVERSE_NO_DECRYPT, kill_blkptr, &ka));
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
dsl_dataset_phys(ds)->ds_unique_bytes == 0);
}
@@ -706,6 +706,11 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
for (t = 0; t < DD_USED_NUM; t++)
ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
if (dd->dd_crypto_obj != 0) {
dsl_crypto_key_destroy_sync(dd->dd_crypto_obj, tx);
(void) spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
}
VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
@@ -951,7 +956,8 @@ dsl_destroy_head(const char *name)
* remove the objects from open context so that the txg sync
* is not too long.
*/
error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_FALSE,
FTAG, &os);
if (error == 0) {
uint64_t obj;
uint64_t prev_snap_txg =
@@ -963,7 +969,7 @@ dsl_destroy_head(const char *name)
(void) dmu_free_long_object(os, obj);
/* sync out all frees */
txg_wait_synced(dmu_objset_pool(os), 0);
dmu_objset_disown(os, FTAG);
dmu_objset_disown(os, B_FALSE, FTAG);
}
}
+30 -13
View File
@@ -159,6 +159,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
{
dmu_buf_t *dbuf;
dsl_dir_t *dd;
dmu_object_info_t doi;
int err;
ASSERT(dsl_pool_config_held(dp));
@@ -167,14 +168,11 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
if (err != 0)
return (err);
dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
{
dmu_object_info_t doi;
dmu_object_info_from_db(dbuf, &doi);
ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
}
#endif
dmu_object_info_from_db(dbuf, &doi);
ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
if (dd == NULL) {
dsl_dir_t *winner;
@@ -182,6 +180,15 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_object = ddobj;
dd->dd_dbuf = dbuf;
dd->dd_pool = dp;
if (dsl_dir_is_zapified(dd) &&
zap_contains(dp->dp_meta_objset, ddobj,
DD_FIELD_CRYPTO_KEY_OBJ) == 0) {
VERIFY0(zap_lookup(dp->dp_meta_objset,
ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
sizeof (uint64_t), 1, &dd->dd_crypto_obj));
}
mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
dsl_prop_init(dd);
@@ -918,6 +925,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
dmu_buf_rele(dbuf, FTAG);
return (ddobj);
@@ -935,6 +943,8 @@ dsl_dir_is_clone(dsl_dir_t *dd)
void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
uint64_t intval;
mutex_enter(&dd->dd_lock);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
dsl_dir_phys(dd)->dd_used_bytes);
@@ -962,18 +972,17 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
mutex_exit(&dd->dd_lock);
if (dsl_dir_is_zapified(dd)) {
uint64_t count;
objset_t *os = dd->dd_pool->dp_meta_objset;
if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
sizeof (count), 1, &count) == 0) {
sizeof (intval), 1, &intval) == 0) {
dsl_prop_nvlist_add_uint64(nv,
ZFS_PROP_FILESYSTEM_COUNT, count);
ZFS_PROP_FILESYSTEM_COUNT, intval);
}
if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
sizeof (count), 1, &count) == 0) {
sizeof (intval), 1, &intval) == 0) {
dsl_prop_nvlist_add_uint64(nv,
ZFS_PROP_SNAPSHOT_COUNT, count);
ZFS_PROP_SNAPSHOT_COUNT, intval);
}
}
@@ -1814,6 +1823,14 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
}
}
/* check for encryption errors */
error = dsl_dir_rename_crypt_check(dd, newparent);
if (error != 0) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(EACCES));
}
/* no rename into our descendant */
if (closest_common_ancestor(dd, newparent) == dd) {
dsl_dir_rele(newparent, FTAG);
+16 -3
View File
@@ -359,7 +359,8 @@ dsl_pool_close(dsl_pool_t *dp)
}
dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
uint64_t txg)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
@@ -373,6 +374,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
/* create and open the MOS (meta-objset) */
dp->dp_meta_objset = dmu_objset_create_impl(spa,
NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
spa->spa_meta_objset = dp->dp_meta_objset;
/* create the pool directory */
err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -410,8 +412,19 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
dsl_pool_create_origin(dp, tx);
/*
* Some features may be needed when creating the root dataset, so we
* create the feature objects here.
*/
if (spa_version(spa) >= SPA_VERSION_FEATURES)
spa_feature_create_zap_objects(spa, tx);
if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF &&
dcp->cp_crypt != ZIO_CRYPT_INHERIT)
spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx);
/* create the root dataset */
obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);
/* create the root objset */
VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
@@ -865,7 +878,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
/* create the origin dir, ds, & snap-ds */
dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
NULL, 0, kcred, tx);
NULL, 0, kcred, NULL, tx);
VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
+2 -1
View File
@@ -963,7 +963,7 @@ typedef enum dsl_prop_getflags {
DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */
DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */
DSL_PROP_GET_LOCAL = 0x4, /* local properties */
DSL_PROP_GET_RECEIVED = 0x8 /* received properties */
DSL_PROP_GET_RECEIVED = 0x8, /* received properties */
} dsl_prop_getflags_t;
static int
@@ -1130,6 +1130,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
if (err)
break;
}
out:
if (err) {
nvlist_free(*nvp);
+14 -3
View File
@@ -683,7 +683,7 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
zilog = zil_alloc(dp->dp_meta_objset, zh);
(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
claim_txg);
claim_txg, B_FALSE);
zil_free(zilog);
}
@@ -695,6 +695,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
{
zbookmark_phys_t czb;
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
if (zfs_no_scrub_prefetch)
return;
@@ -703,11 +704,16 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
return;
if (BP_IS_PROTECTED(bp)) {
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
zio_flags |= ZIO_FLAG_RAW;
}
SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &czb);
}
static boolean_t
@@ -793,6 +799,11 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_buf_t *buf;
if (BP_IS_PROTECTED(bp)) {
ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
zio_flags |= ZIO_FLAG_RAW;
}
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
+66 -19
View File
@@ -1169,6 +1169,8 @@ spa_activate(spa_t *spa, int mode)
spa_error_entry_compare, sizeof (spa_error_entry_t),
offsetof(spa_error_entry_t, se_avl));
spa_keystore_init(&spa->spa_keystore);
/*
* This taskq is used to perform zvol-minor-related tasks
* asynchronously. This has several advantages, including easy
@@ -1246,10 +1248,11 @@ spa_deactivate(spa_t *spa)
* still have errors left in the queues. Empty them just in case.
*/
spa_errlog_drain(spa);
avl_destroy(&spa->spa_errlist_scrub);
avl_destroy(&spa->spa_errlist_last);
spa_keystore_fini(&spa->spa_keystore);
spa->spa_state = POOL_STATE_UNINITIALIZED;
mutex_enter(&spa->spa_proc_lock);
@@ -2094,8 +2097,8 @@ spa_load_verify(spa_t *spa)
if (spa_load_verify_metadata) {
error = traverse_pool(spa, spa->spa_verify_min_txg,
TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
spa_load_verify_cb, rio);
TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
}
(void) zio_wait(rio);
@@ -2301,7 +2304,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
spa->spa_loaded_ts.tv_nsec = 0;
}
if (error != EBADF) {
zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0);
}
}
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
@@ -3978,12 +3981,28 @@ spa_l2cache_drop(spa_t *spa)
}
}
/*
* Verify encryption parameters for spa creation. If we are encrypting, we must
* have the encryption feature flag enabled.
*/
static int
spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
boolean_t has_encryption)
{
if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
!has_encryption)
return (SET_ERROR(ENOTSUP));
return (dmu_objset_create_crypt_check(NULL, dcp));
}
/*
* Pool Creation
*/
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
nvlist_t *zplprops)
nvlist_t *zplprops, dsl_crypto_params_t *dcp)
{
spa_t *spa;
char *altroot = NULL;
@@ -3994,8 +4013,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
uint64_t version, obj;
uint64_t version, obj, root_dsobj = 0;
boolean_t has_features;
boolean_t has_encryption;
spa_feature_t feat;
char *feat_name;
nvpair_t *elem;
int c, i;
char *poolname;
@@ -4038,10 +4060,28 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
has_features = B_FALSE;
has_encryption = B_FALSE;
for (elem = nvlist_next_nvpair(props, NULL);
elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
if (zpool_prop_feature(nvpair_name(elem)))
if (zpool_prop_feature(nvpair_name(elem))) {
has_features = B_TRUE;
feat_name = strchr(nvpair_name(elem), '@') + 1;
VERIFY0(zfeature_lookup_name(feat_name, &feat));
if (feat == SPA_FEATURE_ENCRYPTION)
has_encryption = B_TRUE;
}
}
/* verify encryption params, if they were provided */
if (dcp != NULL) {
error = spa_create_check_encryption_params(dcp, has_encryption);
if (error != 0) {
spa_deactivate(spa);
spa_remove(spa);
mutex_exit(&spa_namespace_lock);
return (error);
}
}
if (has_features || nvlist_lookup_uint64(props,
@@ -4131,8 +4171,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
}
spa->spa_is_initializing = B_TRUE;
spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
spa->spa_meta_objset = dp->dp_meta_objset;
spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
spa->spa_is_initializing = B_FALSE;
/*
@@ -4157,9 +4196,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
cmn_err(CE_PANIC, "failed to add pool config");
}
if (spa_version(spa) >= SPA_VERSION_FEATURES)
spa_feature_create_zap_objects(spa, tx);
if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
sizeof (uint64_t), 1, &version, tx) != 0) {
@@ -4220,15 +4256,26 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
dmu_tx_commit(tx);
spa->spa_sync_on = B_TRUE;
txg_sync_start(spa->spa_dsl_pool);
mmp_thread_start(spa);
/*
* We explicitly wait for the first transaction to complete so that our
* bean counters are appropriately updated.
* If the root dataset is encrypted we will need to create key mappings
* for the zio layer before we start to write any data to disk and hold
* them until after the first txg has been synced. Waiting for the first
* transaction to complete also ensures that our bean counters are
* appropriately updated.
*/
txg_wait_synced(spa->spa_dsl_pool, txg);
if (dp->dp_root_dir->dd_crypto_obj != 0) {
root_dsobj = dsl_dir_phys(dp->dp_root_dir)->dd_head_dataset_obj;
VERIFY0(spa_keystore_create_mapping_impl(spa, root_dsobj,
dp->dp_root_dir, FTAG));
}
spa->spa_sync_on = B_TRUE;
txg_sync_start(dp);
mmp_thread_start(spa);
txg_wait_synced(dp, txg);
if (dp->dp_root_dir->dd_crypto_obj != 0)
VERIFY0(spa_keystore_remove_mapping(spa, root_dsobj, FTAG));
spa_config_sync(spa, B_FALSE, B_TRUE);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
+1 -1
View File
@@ -305,7 +305,7 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
*/
if (target->spa_ccw_fail_time == 0) {
zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
target, NULL, NULL, 0, 0);
target, NULL, NULL, NULL, 0, 0);
}
target->spa_ccw_fail_time = gethrtime();
spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
+1 -2
View File
@@ -90,9 +90,8 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb)
* during spa_errlog_sync().
*/
void
spa_log_error(spa_t *spa, zio_t *zio)
spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
{
zbookmark_phys_t *zb = &zio->io_logical->io_bookmark;
spa_error_entry_t search;
spa_error_entry_t *new;
avl_tree_t *tree;
+6 -1
View File
@@ -385,11 +385,16 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
{
int err = 0;
dmu_tx_t *tx;
nvlist_t *nvarg;
nvlist_t *nvarg, *in_nvl = NULL;
if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
return (SET_ERROR(EINVAL));
err = nvlist_lookup_nvlist(nvl, ZPOOL_HIST_INPUT_NVL, &in_nvl);
if (err == 0) {
(void) nvlist_remove_all(in_nvl, ZPOOL_HIDDEN_ARGS);
}
tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err) {
+11 -1
View File
@@ -1414,6 +1414,7 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
char type[256];
char *checksum = NULL;
char *compress = NULL;
char *crypt_type = NULL;
if (bp != NULL) {
if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
@@ -1427,6 +1428,15 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
sizeof (type));
}
if (BP_IS_ENCRYPTED(bp)) {
crypt_type = "encrypted";
} else if (BP_IS_AUTHENTICATED(bp)) {
crypt_type = "authenticated";
} else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
crypt_type = "indirect-MAC";
} else {
crypt_type = "unencrypted";
}
if (!BP_IS_EMBEDDED(bp)) {
checksum =
zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
@@ -1435,7 +1445,7 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
}
SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
compress);
crypt_type, compress);
}
void
+5 -4
View File
@@ -1050,7 +1050,7 @@ vdev_probe_done(zio_t *zio)
} else {
ASSERT(zio->io_error != 0);
zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, 0, 0);
spa, vd, NULL, NULL, 0, 0);
zio->io_error = SET_ERROR(ENXIO);
}
@@ -1397,7 +1397,7 @@ vdev_open(vdev_t *vd)
if (ashift > vd->vdev_top->vdev_ashift &&
vd->vdev_ops->vdev_op_leaf) {
zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
spa, vd, NULL, 0, 0);
spa, vd, NULL, NULL, 0, 0);
}
vd->vdev_max_asize = max_asize;
@@ -3590,7 +3590,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
}
zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
zfs_ereport_post(class, spa, vd, NULL, NULL,
save_state, 0);
}
/* Erase any notion of persistent removed state */
@@ -3758,7 +3759,7 @@ vdev_deadman(vdev_t *vd)
fio->io_timestamp, delta,
vq->vq_io_complete_ts);
zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
spa, vd, fio, 0, 0);
spa, vd, &fio->io_bookmark, fio, 0, 0);
}
}
mutex_exit(&vq->vq_lock);
+5 -4
View File
@@ -1766,9 +1766,9 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
zfs_ereport_post_checksum(zio->io_spa, vd, zio,
rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data,
&zbc);
zfs_ereport_post_checksum(zio->io_spa, vd,
&zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
rc->rc_abd, bad_data, &zbc);
}
}
@@ -2256,7 +2256,8 @@ vdev_raidz_io_done(zio_t *zio)
zfs_ereport_start_checksum(
zio->io_spa,
vd->vdev_child[rc->rc_devidx],
zio, rc->rc_offset, rc->rc_size,
&zio->io_bookmark, zio,
rc->rc_offset, rc->rc_size,
(void *)(uintptr_t)c, &zbc);
}
}
+2 -2
View File
@@ -424,8 +424,8 @@ spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
* We create feature flags ZAP objects in two instances: during pool
* creation and during pool upgrade.
*/
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
tx->tx_txg == TXG_INITIAL));
ASSERT((!spa->spa_sync_on && tx->tx_txg == TXG_INITIAL) ||
dsl_pool_sync_context(spa_get_dsl(spa)));
spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+1 -1
View File
@@ -2204,7 +2204,7 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
* placed into the working_mode, giving the caller a mask of denied
* accesses. Returns:
* 0 if all AoI granted
* EACCESS if the denied mask is non-zero
* EACCES if the denied mask is non-zero
* other error if abnormal failure (e.g., IO error)
*
* A secondary usage of the function is to determine if any of the
+25 -29
View File
@@ -142,8 +142,8 @@ zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
static void
zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
uint64_t stateoroffset, uint64_t size)
const char *subclass, spa_t *spa, vdev_t *vd, zbookmark_phys_t *zb,
zio_t *zio, uint64_t stateoroffset, uint64_t size)
{
nvlist_t *ereport, *detector;
@@ -413,24 +413,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
DATA_TYPE_UINT64, zio->io_size, NULL);
}
/*
* Payload for I/Os with corresponding logical information.
*/
if (zio->io_logical != NULL)
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
DATA_TYPE_UINT64,
zio->io_logical->io_bookmark.zb_objset,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
DATA_TYPE_UINT64,
zio->io_logical->io_bookmark.zb_object,
FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
DATA_TYPE_INT64,
zio->io_logical->io_bookmark.zb_level,
FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
DATA_TYPE_UINT64,
zio->io_logical->io_bookmark.zb_blkid, NULL);
} else if (vd != NULL) {
/*
* If we have a vdev but no zio, this is a device fault, and the
@@ -442,6 +424,20 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, stateoroffset, NULL);
}
/*
* Payload for I/Os with corresponding logical information.
*/
if (zb != NULL && (zio == NULL || zio->io_logical != NULL))
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
DATA_TYPE_UINT64, zb->zb_objset,
FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
DATA_TYPE_UINT64, zb->zb_object,
FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
DATA_TYPE_INT64, zb->zb_level,
FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
DATA_TYPE_UINT64, zb->zb_blkid, NULL);
mutex_exit(&spa->spa_errlist_lock);
*ereport_out = ereport;
@@ -771,8 +767,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
#endif
void
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
uint64_t stateoroffset, uint64_t size)
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size)
{
#ifdef _KERNEL
nvlist_t *ereport = NULL;
@@ -781,8 +777,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
if (zfs_is_ratelimiting_event(subclass, vd))
return;
zfs_ereport_start(&ereport, &detector,
subclass, spa, vd, zio, stateoroffset, size);
zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
zb, zio, stateoroffset, size);
if (ereport == NULL)
return;
@@ -793,7 +789,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
}
void
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length, void *arg,
zio_bad_cksum_t *info)
{
@@ -823,7 +819,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
#ifdef _KERNEL
zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
if (report->zcr_ereport == NULL) {
zfs_ereport_free_checksum(report);
@@ -879,7 +875,7 @@ zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
void
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, zbookmark_phys_t *zb,
struct zio *zio, uint64_t offset, uint64_t length,
const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
{
@@ -888,8 +884,8 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
nvlist_t *detector = NULL;
zfs_ecksum_info_t *info;
zfs_ereport_start(&ereport, &detector,
FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);
zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
spa, vd, zb, zio, offset, length);
if (ereport == NULL)
return;
+235 -33
View File
@@ -34,7 +34,7 @@
* Copyright 2016 Toomas Soome <tsoome@me.com>
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017 Datto Inc. All rights reserved.
* Copyright 2017 RackTop Systems.
*/
@@ -185,6 +185,7 @@
#include <sys/dsl_scan.h>
#include <sharefs/share.h>
#include <sys/fm/util.h>
#include <sys/dsl_crypt.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
@@ -565,12 +566,12 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
* Try to own the dataset; abort if there is any error,
* (e.g., already mounted, in use, or other error).
*/
error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE,
setsl_tag, &os);
if (error != 0)
return (SET_ERROR(EPERM));
dmu_objset_disown(os, setsl_tag);
dmu_objset_disown(os, B_TRUE, setsl_tag);
if (new_default) {
needed_priv = PRIV_FILE_DOWNGRADE_SL;
@@ -1301,6 +1302,20 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
return (error);
}
/*
 * Security policy check for ioctls that load a wrapping key.
 *
 * Asks zfs_secpolicy_write_perms() whether the credential 'cr' holds
 * ZFS_DELEG_PERM_LOAD_KEY on the dataset named by zc->zc_name and returns
 * its result unchanged. 'innvl' is not consulted.
 */
static int
zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	int error;

	error = zfs_secpolicy_write_perms(zc->zc_name,
	    ZFS_DELEG_PERM_LOAD_KEY, cr);

	return (error);
}
/*
 * Security policy check for ioctls that change a wrapping key.
 *
 * Asks zfs_secpolicy_write_perms() whether the credential 'cr' holds
 * ZFS_DELEG_PERM_CHANGE_KEY on the dataset named by zc->zc_name and returns
 * its result unchanged. 'innvl' is not consulted.
 */
static int
zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
	int error;

	error = zfs_secpolicy_write_perms(zc->zc_name,
	    ZFS_DELEG_PERM_CHANGE_KEY, cr);

	return (error);
}
/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
@@ -1462,7 +1477,7 @@ zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
if (zfsvfs->z_sb) {
deactivate_super(zfsvfs->z_sb);
} else {
dmu_objset_disown(zfsvfs->z_os, zfsvfs);
dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
zfsvfs_free(zfsvfs);
}
}
@@ -1474,6 +1489,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
nvlist_t *config, *props = NULL;
nvlist_t *rootprops = NULL;
nvlist_t *zplprops = NULL;
dsl_crypto_params_t *dcp = NULL;
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config)))
@@ -1488,6 +1504,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
if (props) {
nvlist_t *nvl = NULL;
nvlist_t *hidden_args = NULL;
uint64_t version = SPA_VERSION;
(void) nvlist_lookup_uint64(props,
@@ -1506,6 +1523,18 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
}
(void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
}
(void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS,
&hidden_args);
error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
rootprops, hidden_args, &dcp);
if (error != 0) {
nvlist_free(config);
nvlist_free(props);
return (error);
}
(void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS);
VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
error = zfs_fill_zplprops_root(version, rootprops,
zplprops, NULL);
@@ -1513,7 +1542,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
goto pool_props_bad;
}
error = spa_create(zc->zc_name, config, props, zplprops);
error = spa_create(zc->zc_name, config, props, zplprops, dcp);
/*
* Set the remaining root properties
@@ -1527,6 +1556,7 @@ pool_props_bad:
nvlist_free(zplprops);
nvlist_free(config);
nvlist_free(props);
dsl_crypto_params_free(dcp, !!error);
return (error);
}
@@ -1802,15 +1832,16 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc)
int error;
/* XXX reading from objset not owned */
if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE,
FTAG, &os)) != 0)
return (error);
if (dmu_objset_type(os) != DMU_OST_ZFS) {
dmu_objset_rele(os, FTAG);
dmu_objset_rele_flags(os, B_TRUE, FTAG);
return (SET_ERROR(EINVAL));
}
error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
sizeof (zc->zc_value));
dmu_objset_rele(os, FTAG);
dmu_objset_rele_flags(os, B_TRUE, FTAG);
return (error);
}
@@ -1831,15 +1862,16 @@ zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
int error;
/* XXX reading from objset not owned */
if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE,
FTAG, &os)) != 0)
return (error);
if (dmu_objset_type(os) != DMU_OST_ZFS) {
dmu_objset_rele(os, FTAG);
dmu_objset_rele_flags(os, B_TRUE, FTAG);
return (SET_ERROR(EINVAL));
}
error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
sizeof (zc->zc_value));
dmu_objset_rele(os, FTAG);
dmu_objset_rele_flags(os, B_TRUE, FTAG);
return (error);
}
@@ -2385,7 +2417,8 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
{
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
uint64_t intval;
uint64_t intval = 0;
char *strval = NULL;
int err = -1;
if (prop == ZPROP_INVAL) {
@@ -2401,10 +2434,12 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
&pair) == 0);
}
if (zfs_prop_get_type(prop) == PROP_TYPE_STRING)
return (-1);
VERIFY(0 == nvpair_value_uint64(pair, &intval));
/* all special properties are numeric except for keylocation */
if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
strval = fnvpair_value_string(pair);
} else {
intval = fnvpair_value_uint64(pair);
}
switch (prop) {
case ZFS_PROP_QUOTA:
@@ -2421,6 +2456,16 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
} else {
err = dsl_dir_activate_fs_ss_limit(dsname);
}
/*
* Set err to -1 to force the zfs_set_prop_nvlist code down the
* default path to set the value in the nvlist.
*/
if (err == 0)
err = -1;
break;
case ZFS_PROP_KEYLOCATION:
err = dsl_crypto_can_set_keylocation(dsname, strval);
/*
* Set err to -1 to force the zfs_set_prop_nvlist code down the
* default path to set the value in the nvlist.
@@ -3156,6 +3201,8 @@ zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
* innvl: {
* "type" -> dmu_objset_type_t (int32)
* (optional) "props" -> { prop -> value }
* (optional) "hidden_args" -> { "wkeydata" -> value }
* raw uint8_t array of encryption wrapping key data (32 bytes)
* }
*
* outnvl: propname -> error code (int32)
@@ -3166,15 +3213,18 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
int error = 0;
zfs_creat_t zct = { 0 };
nvlist_t *nvprops = NULL;
nvlist_t *hidden_args = NULL;
void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
int32_t type32;
dmu_objset_type_t type;
boolean_t is_insensitive = B_FALSE;
dsl_crypto_params_t *dcp = NULL;
if (nvlist_lookup_int32(innvl, "type", &type32) != 0)
return (SET_ERROR(EINVAL));
type = type32;
(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
(void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
switch (type) {
case DMU_OST_ZFS:
@@ -3240,9 +3290,18 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
}
}
error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops,
hidden_args, &dcp);
if (error != 0) {
nvlist_free(zct.zct_zplprops);
return (error);
}
error = dmu_objset_create(fsname, type,
is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct);
nvlist_free(zct.zct_zplprops);
dsl_crypto_params_free(dcp, !!error);
/*
* It would be nice to do this atomically.
@@ -3277,6 +3336,8 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: {
* "origin" -> name of origin snapshot
* (optional) "props" -> { prop -> value }
* (optional) "hidden_args" -> { "wkeydata" -> value }
* raw uint8_t array of encryption wrapping key data (32 bytes)
* }
*
* outputs:
@@ -3299,9 +3360,8 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
if (dataset_namecheck(origin_name, NULL, NULL) != 0)
return (SET_ERROR(EINVAL));
error = dmu_objset_clone(fsname, origin_name);
if (error != 0)
return (error);
/*
* It would be nice to do this atomically.
@@ -4160,7 +4220,11 @@ extract_delay_props(nvlist_t *props)
{
nvlist_t *delayprops;
nvpair_t *nvp, *tmp;
static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
static const zfs_prop_t delayable[] = {
ZFS_PROP_REFQUOTA,
ZFS_PROP_KEYLOCATION,
0
};
int i;
VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
@@ -4704,6 +4768,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
boolean_t embedok = (zc->zc_flags & 0x1);
boolean_t large_block_ok = (zc->zc_flags & 0x2);
boolean_t compressok = (zc->zc_flags & 0x4);
boolean_t rawok = (zc->zc_flags & 0x8);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4735,7 +4800,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
if (error != 0)
return (error);
error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
error = dsl_dataset_hold_obj(dp, zc->zc_sendobj,
FTAG, &tosnap);
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
@@ -4751,7 +4817,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
}
}
error = dmu_send_estimate(tosnap, fromsnap, compressok,
error = dmu_send_estimate(tosnap, fromsnap, compressok || rawok,
&zc->zc_objset_type);
if (fromsnap != NULL)
@@ -4765,7 +4831,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
zc->zc_fromobj, embedok, large_block_ok, compressok,
zc->zc_fromobj, embedok, large_block_ok, compressok, rawok,
zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
@@ -5152,7 +5218,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
error = zfs_suspend_fs(zfsvfs);
if (error == 0) {
dmu_objset_refresh_ownership(zfsvfs->z_os,
zfsvfs);
B_TRUE, zfsvfs);
error = zfs_resume_fs(zfsvfs, ds);
}
}
@@ -5161,12 +5227,12 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
deactivate_super(zfsvfs->z_sb);
} else {
/* XXX kind of reading contents without owning */
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
if (error != 0)
return (error);
error = dmu_objset_userspace_upgrade(os);
dmu_objset_rele(os, FTAG);
dmu_objset_rele_flags(os, B_TRUE, FTAG);
}
return (error);
@@ -5185,7 +5251,7 @@ zfs_ioc_userobjspace_upgrade(zfs_cmd_t *zc)
objset_t *os;
int error;
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
if (error != 0)
return (error);
@@ -5209,7 +5275,7 @@ zfs_ioc_userobjspace_upgrade(zfs_cmd_t *zc)
}
dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
dsl_dataset_rele(dmu_objset_ds(os), FTAG);
dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG);
return (error);
}
@@ -5745,6 +5811,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* (optional) "compressok" -> (value ignored)
* presence indicates compressed DRR_WRITE records are permitted
* (optional) "rawok" -> (value ignored)
* presence indicates raw encrypted records should be used.
* (optional) "resume_object" and "resume_offset" -> (uint64)
* if present, resume send stream from specified object and offset.
* }
@@ -5763,6 +5831,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
boolean_t largeblockok;
boolean_t embedok;
boolean_t compressok;
boolean_t rawok;
uint64_t resumeobj = 0;
uint64_t resumeoff = 0;
@@ -5775,6 +5844,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
compressok = nvlist_exists(innvl, "compressok");
rawok = nvlist_exists(innvl, "rawok");
(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
@@ -5784,7 +5854,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
off = fp->f_offset;
error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
fd, resumeobj, resumeoff, fp->f_vnode, &off);
rawok, fd, resumeobj, resumeoff, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
@@ -5824,6 +5894,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
/* LINTED E_FUNC_SET_NOT_USED */
boolean_t embedok;
boolean_t compressok;
boolean_t rawok;
uint64_t space;
error = dsl_pool_hold(snapname, FTAG, &dp);
@@ -5839,6 +5910,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
compressok = nvlist_exists(innvl, "compressok");
rawok = nvlist_exists(innvl, "rawok");
error = nvlist_lookup_string(innvl, "from", &fromname);
if (error == 0) {
@@ -5852,8 +5924,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
if (error != 0)
goto out;
error = dmu_send_estimate(tosnap, fromsnap, compressok,
&space);
error = dmu_send_estimate(tosnap, fromsnap,
compressok || rawok, &space);
dsl_dataset_rele(fromsnap, FTAG);
} else if (strchr(fromname, '#') != NULL) {
/*
@@ -5868,7 +5940,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
if (error != 0)
goto out;
error = dmu_send_estimate_from_txg(tosnap,
frombm.zbm_creation_txg, compressok, &space);
frombm.zbm_creation_txg, compressok || rawok,
&space);
} else {
/*
* from is not properly formatted as a snapshot or
@@ -5879,7 +5952,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
}
} else {
// If estimating the size of a full send, use dmu_send_estimate
error = dmu_send_estimate(tosnap, NULL, compressok, &space);
error = dmu_send_estimate(tosnap, NULL, compressok || rawok,
&space);
}
fnvlist_add_uint64(outnvl, "space", space);
@@ -5928,6 +6002,124 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
return (err);
}
/*
 * Load a user's wrapping key into the kernel.
 *
 * innvl: {
 *     "hidden_args" -> { "wkeydata" -> value }
 *         raw uint8_t array of encryption wrapping key data (32 bytes)
 *     (optional) "noop" -> (value ignored)
 *         presence indicates that the key should only be verified,
 *         not loaded
 * }
 *
 * outnvl is unused.
 *
 * Returns 0 on success. Returns EINVAL if dsname contains '@' or '%', or if
 * "hidden_args" is missing from innvl; otherwise returns whatever
 * dsl_crypto_params_create_nvlist() or spa_keystore_load_wkey() reported.
 */
/* ARGSUSED */
static int
zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	dsl_crypto_params_t *dcp = NULL;
	nvlist_t *hidden_args;
	boolean_t noop = nvlist_exists(innvl, "noop");
	/*
	 * Second argument handed to dsl_crypto_params_free(): B_TRUE on
	 * every failure path, and the value of 'noop' once the load has
	 * succeeded.
	 */
	boolean_t free_arg = B_TRUE;
	int ret;

	if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
		/* names carrying '@' or '%' are rejected outright */
		ret = SET_ERROR(EINVAL);
	} else if (nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS,
	    &hidden_args) != 0) {
		/* the key material is mandatory for this ioctl */
		ret = SET_ERROR(EINVAL);
	} else {
		ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
		    hidden_args, &dcp);
		if (ret == 0)
			ret = spa_keystore_load_wkey(dsname, dcp, noop);
		if (ret == 0)
			free_arg = noop;
	}

	dsl_crypto_params_free(dcp, free_arg);
	return (ret);
}
/*
 * Unload a user's wrapping key from the kernel.
 *
 * Neither innvl nor outnvl is used.
 *
 * Returns EINVAL if dsname contains '@' or '%'; otherwise returns the
 * result of spa_keystore_unload_wkey() for that dataset.
 */
/* ARGSUSED */
static int
zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	/* names carrying '@' or '%' are rejected outright */
	if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL)
		return (SET_ERROR(EINVAL));

	return (spa_keystore_unload_wkey(dsname));
}
/*
 * Change the user's wrapping key used to decrypt a dataset. The keyformat,
 * keylocation, pbkdf2salt, and pbkdf2iters properties may be supplied as
 * well to change how the key is derived in userspace.
 *
 * innvl: {
 *     (optional) "hidden_args" -> { "wkeydata" -> value }
 *         raw uint8_t array of new encryption wrapping key data (32 bytes)
 *     (optional) "props" -> { prop -> value }
 *     (optional) "crypt_cmd" -> (uint64)
 *         command passed to dsl_crypto_params_create_nvlist();
 *         DCP_CMD_NONE is used when absent
 * }
 *
 * outnvl is unused.
 *
 * Returns 0 on success. Returns EINVAL if dsname contains '@' or '%';
 * otherwise returns whatever dsl_crypto_params_create_nvlist() or
 * spa_keystore_change_key() reported.
 */
/* ARGSUSED */
static int
zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	dsl_crypto_params_t *dcp = NULL;
	nvlist_t *props = NULL;
	nvlist_t *hidden_args = NULL;
	uint64_t cmd = DCP_CMD_NONE;
	int ret;

	if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
		/* names carrying '@' or '%' are rejected outright */
		ret = (SET_ERROR(EINVAL));
	} else {
		/* every input is optional, so lookup failures are ignored */
		(void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd);
		(void) nvlist_lookup_nvlist(innvl, "props", &props);
		(void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS,
		    &hidden_args);

		ret = dsl_crypto_params_create_nvlist(cmd, props, hidden_args,
		    &dcp);
		if (ret == 0)
			ret = spa_keystore_change_key(dsname, dcp);
	}

	/* B_TRUE on any failure, B_FALSE once the change has succeeded */
	dsl_crypto_params_free(dcp, (ret != 0) ? B_TRUE : B_FALSE);
	return (ret);
}
static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
static void
@@ -6099,6 +6291,16 @@ zfs_ioctl_init(void)
zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW,
zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY,
zfs_ioc_load_key, zfs_secpolicy_load_key,
DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE);
zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY,
zfs_ioc_unload_key, zfs_secpolicy_load_key,
DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE);
zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY,
zfs_ioc_change_key, zfs_secpolicy_change_key,
DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
B_TRUE, B_TRUE);
zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
+7 -5
View File
@@ -1048,7 +1048,8 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
* We claim to always be readonly so we can open snapshots;
* other ZPL code will prevent us from writing to snapshots.
*/
error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, B_TRUE,
zfsvfs, &os);
if (error) {
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
@@ -1080,7 +1081,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
error = zfsvfs_init(zfsvfs, os);
if (error != 0) {
dmu_objset_disown(os, zfsvfs);
dmu_objset_disown(os, B_TRUE, zfsvfs);
*zfvp = NULL;
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
@@ -1669,7 +1670,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
out:
if (error) {
dmu_objset_disown(zfsvfs->z_os, zfsvfs);
dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
zfsvfs_free(zfsvfs);
/*
* make sure we don't have dangling sb->s_fs_info which
@@ -1729,7 +1730,8 @@ zfs_umount(struct super_block *sb)
zfsvfs_t *zfsvfs = sb->s_fs_info;
objset_t *os;
arc_remove_prune_callback(zfsvfs->z_arc_prune);
if (zfsvfs->z_arc_prune != NULL)
arc_remove_prune_callback(zfsvfs->z_arc_prune);
VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
os = zfsvfs->z_os;
zpl_bdi_destroy(sb);
@@ -1749,7 +1751,7 @@ zfs_umount(struct super_block *sb)
/*
* Finally release the objset
*/
dmu_objset_disown(os, zfsvfs);
dmu_objset_disown(os, B_TRUE, zfsvfs);
}
zfsvfs_free(zfsvfs);
+45 -21
View File
@@ -193,8 +193,8 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
* Read a log block and make sure it's valid.
*/
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
char **end)
zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
blkptr_t *nbp, void *dst, char **end)
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t aflags = ARC_FLAG_WAIT;
@@ -208,11 +208,14 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
zio_flags |= ZIO_FLAG_SPECULATIVE;
if (!decrypt)
zio_flags |= ZIO_FLAG_RAW;
SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
&abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
zio_cksum_t cksum = bp->blk_cksum;
@@ -287,6 +290,14 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
if (zilog->zl_header->zh_claim_txg == 0)
zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
/*
* If we are not using the resulting data, we are just checking that
* it hasn't been corrupted so we don't need to waste CPU time
* decompressing and decrypting it.
*/
if (wbuf == NULL)
zio_flags |= ZIO_FLAG_RAW;
SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
@@ -307,7 +318,8 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
*/
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
boolean_t decrypt)
{
const zil_header_t *zh = zilog->zl_header;
boolean_t claimed = !!zh->zh_claim_txg;
@@ -348,7 +360,9 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
if (blk_seq > claim_blk_seq)
break;
if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
error = parse_blk_func(zilog, &blk, arg, txg);
if (error != 0)
break;
ASSERT3U(max_blk_seq, <, blk_seq);
max_blk_seq = blk_seq;
@@ -357,7 +371,8 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
break;
error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
lrbuf, &end);
if (error != 0)
break;
@@ -367,7 +382,9 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
ASSERT3U(reclen, >=, sizeof (lr_t));
if (lr->lrc_seq > claim_lr_seq)
goto done;
if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
error = parse_lr_func(zilog, lr, arg, txg);
if (error != 0)
goto done;
ASSERT3U(max_lr_seq, <, lr->lrc_seq);
max_lr_seq = lr->lrc_seq;
@@ -382,7 +399,8 @@ done:
zilog->zl_parse_lr_count = lr_count;
ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
(max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
(max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) ||
(decrypt && error == EIO));
zil_bp_tree_fini(zilog);
zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
@@ -423,9 +441,12 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
* waited for all writes to be stable first), so it is semantically
* correct to declare this the end of the log.
*/
if (lr->lr_blkptr.blk_birth >= first_txg &&
(error = zil_read_log_data(zilog, lr, NULL)) != 0)
return (error);
if (lr->lr_blkptr.blk_birth >= first_txg) {
error = zil_read_log_data(zilog, lr, NULL);
if (error != 0)
return (error);
}
return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}
@@ -579,7 +600,7 @@ zil_create(zilog_t *zilog)
BP_ZERO(&blk);
}
error = zio_alloc_zil(zilog->zl_spa, txg, &blk,
error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
ZIL_MIN_BLKSZ, &slog);
fastwrite = TRUE;
@@ -673,7 +694,7 @@ zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
{
ASSERT(list_is_empty(&zilog->zl_lwb_list));
(void) zil_parse(zilog, zil_free_log_block,
zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
}
int
@@ -687,7 +708,7 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
int error;
error = dmu_objset_own_obj(dp, ds->ds_object,
DMU_OST_ANY, B_FALSE, FTAG, &os);
DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
if (error != 0) {
/*
* EBUSY indicates that the objset is inconsistent, in which
@@ -708,8 +729,10 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
if (!BP_IS_HOLE(&zh->zh_log))
zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
BP_ZERO(&zh->zh_log);
if (os->os_encrypted)
os->os_next_write_raw = B_TRUE;
dsl_dataset_dirty(dmu_objset_ds(os), tx);
dmu_objset_disown(os, FTAG);
dmu_objset_disown(os, B_FALSE, FTAG);
return (0);
}
@@ -723,7 +746,7 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
ASSERT3U(zh->zh_claim_txg, <=, first_txg);
if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
(void) zil_parse(zilog, zil_claim_log_block,
zil_claim_log_record, tx, first_txg);
zil_claim_log_record, tx, first_txg, B_FALSE);
zh->zh_claim_txg = first_txg;
zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
@@ -734,7 +757,7 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
}
ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
dmu_objset_disown(os, FTAG);
dmu_objset_disown(os, B_FALSE, FTAG);
return (0);
}
@@ -792,7 +815,8 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
* which will update spa_max_claim_txg. See spa_load() for details.
*/
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa),
B_FALSE);
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
@@ -1060,7 +1084,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
BP_ZERO(bp);
error = zio_alloc_zil(spa, txg, bp, zil_blksz, &slog);
error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
if (slog) {
ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused);
@@ -2269,7 +2293,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t replay_func[TX_MAX_TYPE])
zilog->zl_replay_time = ddi_get_lbolt();
ASSERT(zilog->zl_replay_blks == 0);
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
zh->zh_claim_txg);
zh->zh_claim_txg, B_TRUE);
vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
zil_destroy(zilog, B_FALSE);
+330 -25
View File
@@ -43,6 +43,7 @@
#include <sys/time.h>
#include <sys/trace_zio.h>
#include <sys/abd.h>
#include <sys/dsl_crypt.h>
/*
* ==========================================================================
@@ -368,7 +369,7 @@ zio_pop_transforms(zio_t *zio)
/*
* ==========================================================================
* I/O transform callbacks for subblocks and decompression
* I/O transform callbacks for subblocks, decompression, and decryption
* ==========================================================================
*/
static void
@@ -394,6 +395,126 @@ zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
}
}
/*
 * Read transform callback that authenticates and/or decrypts a block.
 *
 * zio  - the read zio; zio->io_abd holds the data as read, and zio->io_bp
 *        and zio->io_bookmark describe the block.
 * data - destination abd that receives the (decrypted or copied) data.
 * size - number of bytes to process; must be non-zero.
 *
 * Nothing is returned. On failure zio->io_error is set: an authentication
 * or decryption mismatch (ECKSUM) is reported as EIO, and an ereport is
 * posted for it unless the zio is speculative; any other error code is
 * stored as-is. If the zio already carries an error, nothing is done.
 */
static void
zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
{
	int ret;
	void *tmp;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = BP_GET_LSIZE(bp);
	dmu_object_type_t ot = BP_GET_TYPE(bp);
	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];
	boolean_t no_crypt = B_FALSE;

	ASSERT(BP_USES_CRYPT(bp));
	ASSERT3U(size, !=, 0);

	/* an earlier stage already failed; leave its error in place */
	if (zio->io_error != 0)
		return;

	/*
	 * Verify the cksum of MACs stored in an indirect bp. It will always
	 * be possible to verify this since it does not require an encryption
	 * key.
	 */
	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
		zio_crypt_decode_mac_bp(bp, mac);

		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
			/*
			 * We haven't decompressed the data yet, but
			 * zio_crypt_do_indirect_mac_checksum() requires
			 * decompressed data to be able to parse out the MACs
			 * from the indirect block. We decompress it now and
			 * throw away the result after we are finished.
			 */
			tmp = zio_buf_alloc(lsize);
			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
			    zio->io_abd, tmp, zio->io_size, lsize);
			if (ret != 0) {
				/* release the scratch buffer before bailing */
				zio_buf_free(tmp, lsize);
				ret = SET_ERROR(EIO);
				goto error;
			}
			ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
			    tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
			zio_buf_free(tmp, lsize);
		} else {
			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
		}
		/* the payload itself is not encrypted; pass it through */
		abd_copy(data, zio->io_abd, size);

		if (ret != 0)
			goto error;

		return;
	}

	/*
	 * If this is an authenticated block, just check the MAC. It would be
	 * nice to separate this out into its own flag, but for the moment
	 * enum zio_flag is out of bits.
	 */
	if (BP_IS_AUTHENTICATED(bp)) {
		if (ot == DMU_OT_OBJSET) {
			ret = spa_do_crypt_objset_mac_abd(B_FALSE, zio->io_spa,
			    zio->io_bookmark.zb_objset, zio->io_abd, size,
			    BP_SHOULD_BYTESWAP(bp));
		} else {
			zio_crypt_decode_mac_bp(bp, mac);
			ret = spa_do_crypt_mac_abd(B_FALSE, zio->io_spa,
			    zio->io_bookmark.zb_objset, zio->io_abd, size, mac);
		}
		abd_copy(data, zio->io_abd, size);

		if (ret != 0)
			goto error;

		return;
	}

	zio_crypt_decode_params_bp(bp, salt, iv);

	/* intent log blocks keep their MAC in the zil_chain_t header */
	if (ot == DMU_OT_INTENT_LOG) {
		tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
		zio_crypt_decode_mac_zil(tmp, mac);
		abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
	} else {
		zio_crypt_decode_mac_bp(bp, mac);
	}

	ret = spa_do_crypt_abd(B_FALSE, zio->io_spa, zio->io_bookmark.zb_objset,
	    bp, bp->blk_birth, size, data, zio->io_abd, iv, mac, salt,
	    &no_crypt);
	/* nothing was transformed, so hand the source data over unchanged */
	if (no_crypt)
		abd_copy(data, zio->io_abd, size);

	if (ret != 0)
		goto error;

	return;

error:
	/* assert that the key was found unless this was speculative */
	ASSERT(ret != ENOENT || (zio->io_flags & ZIO_FLAG_SPECULATIVE));

	/*
	 * If there was a decryption / authentication error return EIO as
	 * the io_error. If this was not a speculative zio, create an ereport.
	 * The error must be stored in the zio in both branches; otherwise a
	 * failed MAC check would leave the zio looking successful.
	 */
	if (ret == ECKSUM) {
		zio->io_error = SET_ERROR(EIO);
		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
			zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
			    zio->io_spa, NULL, &zio->io_bookmark, zio, 0, 0);
		}
	} else {
		zio->io_error = ret;
	}
}
/*
* ==========================================================================
* I/O parent/child relationships and pipeline interlocks
@@ -606,7 +727,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
ASSERT(vd || stage == ZIO_STAGE_OPEN);
IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0);
IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
bzero(zio, sizeof (zio_t));
@@ -844,9 +965,12 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
* Data can be NULL if we are going to call zio_write_override() to
* provide the already-allocated BP. But we may need the data to
* verify a dedup hit (if requested). In this case, don't try to
* dedup (just take the already-allocated BP verbatim).
* dedup (just take the already-allocated BP verbatim). Encrypted
* dedup blocks need data as well so we also disable dedup in this
* case.
*/
if (data == NULL && zio->io_prop.zp_dedup_verify) {
if (data == NULL &&
(zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
}
@@ -1186,16 +1310,23 @@ static int
zio_read_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
uint64_t psize =
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t psize =
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
psize, psize, zio_decompress);
}
if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
zio->io_child_type == ZIO_CHILD_LOGICAL) {
zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
psize, psize, zio_decrypt);
}
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
int psize = BPE_GET_PSIZE(bp);
void *data = abd_borrow_buf(zio->io_abd, psize);
@@ -1222,7 +1353,6 @@ zio_read_bp_init(zio_t *zio)
static int
zio_write_bp_init(zio_t *zio)
{
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
@@ -1261,7 +1391,8 @@ zio_write_bp_init(zio_t *zio)
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
!zp->zp_encrypt) {
BP_SET_DEDUP(bp, 1);
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
@@ -1290,8 +1421,6 @@ zio_write_compress(zio_t *zio)
uint64_t psize = zio->io_size;
int pass = 1;
EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
/*
* If our children haven't all reached the ready stage,
* wait for them and then repeat this pipeline stage.
@@ -1341,13 +1470,15 @@ zio_write_compress(zio_t *zio)
}
/* If it's a compressed write that is not raw, compress the buffer. */
if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
if (compress != ZIO_COMPRESS_OFF &&
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
void *cbuf = zio_buf_alloc(lsize);
psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
} else if (!zp->zp_dedup && !zp->zp_encrypt &&
psize <= BPE_PAYLOAD_SIZE &&
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
encode_embedded_bp_compressed(bp,
@@ -1445,6 +1576,8 @@ zio_write_compress(zio_t *zio)
if (zp->zp_dedup) {
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
ASSERT(!zp->zp_encrypt ||
DMU_OT_IS_ENCRYPTED(zp->zp_type));
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
}
if (zp->zp_nopwrite) {
@@ -1868,7 +2001,8 @@ zio_suspend(spa_t *spa, zio_t *zio)
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
"failure and has been suspended.\n", spa_name(spa));
zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0, 0);
mutex_enter(&spa->spa_suspend_lock);
@@ -2298,11 +2432,19 @@ zio_write_gang_block(zio_t *pio)
uint64_t resid = pio->io_size;
uint64_t lsize;
int copies = gio->io_prop.zp_copies;
int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
int gbh_copies;
zio_prop_t zp;
int g, error;
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
/*
* encrypted blocks need DVA[2] free so encrypted gang headers can't
* have a third copy.
*/
gbh_copies = MIN(copies + 1, spa_max_replication(spa));
if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
gbh_copies = SPA_DVAS_PER_BP - 1;
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
@@ -2376,12 +2518,16 @@ zio_write_gang_block(zio_t *pio)
zp.zp_checksum = gio->io_prop.zp_checksum;
zp.zp_compress = ZIO_COMPRESS_OFF;
zp.zp_encrypt = gio->io_prop.zp_encrypt;
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
zp.zp_copies = gio->io_prop.zp_copies;
zp.zp_dedup = B_FALSE;
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
@@ -2460,6 +2606,7 @@ zio_nop_write(zio_t *zio)
if (BP_IS_HOLE(bp_orig) ||
!(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE) ||
BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
@@ -2609,7 +2756,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
* pushed the I/O transforms. That's an important optimization
* because otherwise we'd compress/encrypt all dmu_sync() data twice.
* However, we should never get a raw, override zio so in these
* cases we can compare the io_data directly. This is useful because
* cases we can compare the io_abd directly. This is useful because
* it allows us to do dedup verification even if we don't have access
* to the original data (for instance, if the encryption keys aren't
* loaded).
@@ -3097,8 +3244,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
boolean_t *slog)
zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
uint64_t size, boolean_t *slog)
{
int error = 1;
zio_alloc_list_t io_alloc_list;
@@ -3130,6 +3277,23 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
BP_SET_LEVEL(new_bp, 0);
BP_SET_DEDUP(new_bp, 0);
BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
/*
* encrypted blocks will require an IV and salt. We generate
* these now since we will not be rewriting the bp at
* rewrite time.
*/
if (os->os_encrypted) {
uint8_t iv[ZIO_DATA_IV_LEN];
uint8_t salt[ZIO_DATA_SALT_LEN];
BP_SET_CRYPT(new_bp, B_TRUE);
VERIFY0(spa_crypt_get_salt(spa,
dmu_objset_id(os), salt));
VERIFY0(zio_crypt_generate_iv(iv));
zio_crypt_encode_params_bp(new_bp, salt, iv);
}
}
return (error);
@@ -3462,6 +3626,146 @@ zio_vdev_io_bypass(zio_t *zio)
zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
/*
* ==========================================================================
* Encrypt and store encryption parameters
* ==========================================================================
*/
/*
* This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
* managing the storage of encryption parameters and passing them to the
* lower-level encryption functions.
*
* Depending on the zio and the block pointer, one of the following happens:
*  - gang children and non-allocating writes (other than intent log
*    blocks) are passed through untouched;
*  - writes that are neither flagged for encryption (zp_encrypt) nor
*    already marked encrypted in the bp get their crypt bit cleared;
*  - raw encrypted writes copy the caller-supplied byteorder, MAC, salt
*    and IV from the zio properties into the bp without touching the data;
*  - indirect blocks (level > 0) store a checksum of the lower-level MACs;
*  - objset blocks have their MACs computed through
*    spa_do_crypt_objset_mac_abd();
*  - object types that are not encrypted are only authenticated with a MAC;
*  - everything else is encrypted into a newly allocated buffer which is
*    pushed onto the zio as a transform (unless the crypt routine reports
*    no_crypt).
*
* Every path returns ZIO_PIPELINE_CONTINUE; failures of the underlying
* crypto routines are treated as fatal through VERIFY0().
*/
static int
zio_encrypt(zio_t *zio)
{
zio_prop_t *zp = &zio->io_prop;
spa_t *spa = zio->io_spa;
blkptr_t *bp = zio->io_bp;
uint64_t psize = BP_GET_PSIZE(bp);
dmu_object_type_t ot = BP_GET_TYPE(bp);
void *enc_buf = NULL;
abd_t *eabd = NULL;
uint8_t salt[ZIO_DATA_SALT_LEN];
uint8_t iv[ZIO_DATA_IV_LEN];
uint8_t mac[ZIO_DATA_MAC_LEN];
boolean_t no_crypt = B_FALSE;
/* the root zio already encrypted the data */
if (zio->io_child_type == ZIO_CHILD_GANG)
return (ZIO_PIPELINE_CONTINUE);
/* only ZIL blocks are re-encrypted on rewrite */
if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
return (ZIO_PIPELINE_CONTINUE);
/*
* Nothing to protect: the write properties do not request encryption
* and the bp is not already marked encrypted. Make sure the crypt bit
* is clear before moving on.
*/
if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
BP_SET_CRYPT(bp, B_FALSE);
return (ZIO_PIPELINE_CONTINUE);
}
/* if we are doing raw encryption set the provided encryption params */
if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
BP_SET_CRYPT(bp, B_TRUE);
BP_SET_BYTEORDER(bp, zp->zp_byteorder);
/* objset blocks do not get a MAC encoded into the bp here */
if (ot != DMU_OT_OBJSET)
zio_crypt_encode_mac_bp(bp, zp->zp_mac);
/* salt and IV are only stored for encrypted object types */
if (DMU_OT_IS_ENCRYPTED(ot))
zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
return (ZIO_PIPELINE_CONTINUE);
}
/* indirect blocks only maintain a cksum of the lower level MACs */
if (BP_GET_LEVEL(bp) > 0) {
BP_SET_CRYPT(bp, B_TRUE);
/* note: computed over io_orig_abd with the logical size */
VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
mac));
zio_crypt_encode_mac_bp(bp, mac);
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Objset blocks are a special case since they have 2 256-bit MACs
* embedded within them.
*/
if (ot == DMU_OT_OBJSET) {
ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
BP_SET_CRYPT(bp, B_TRUE);
/* no MAC is encoded into the bp on this path */
VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa,
zio->io_bookmark.zb_objset, zio->io_abd, psize,
BP_SHOULD_BYTESWAP(bp)));
return (ZIO_PIPELINE_CONTINUE);
}
/* unencrypted object types are only authenticated with a MAC */
if (!DMU_OT_IS_ENCRYPTED(ot)) {
BP_SET_CRYPT(bp, B_TRUE);
VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa,
zio->io_bookmark.zb_objset, zio->io_abd, psize, mac));
zio_crypt_encode_mac_bp(bp, mac);
return (ZIO_PIPELINE_CONTINUE);
}
/*
* Later passes of sync-to-convergence may decide to rewrite data
* in place to avoid more disk reallocations. This presents a problem
* for encryption because this constitutes rewriting the new data with
* the same encryption key and IV. However, this only applies to blocks
* in the MOS (particularly the spacemaps) and we do not encrypt the
* MOS. We assert that the zio is allocating or an intent log write
* to enforce this.
*/
ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
ASSERT3U(psize, !=, 0);
/*
* Allocate the destination for the ciphertext and wrap it in an abd.
* NOTE(review): abd_take_ownership_of_buf() presumably makes eabd
* responsible for releasing enc_buf, since the no_crypt path below
* only calls abd_free(eabd) -- confirm against the abd implementation.
*/
enc_buf = zio_buf_alloc(psize);
eabd = abd_get_from_buf(enc_buf, psize);
abd_take_ownership_of_buf(eabd, B_TRUE);
/*
* For an explanation of what encryption parameters are stored
* where, see the block comment in zio_crypt.c.
*/
if (ot == DMU_OT_INTENT_LOG) {
/* ZIL blocks reuse the salt and IV already encoded in the bp */
zio_crypt_decode_params_bp(bp, salt, iv);
} else {
/*
* salt and iv are still uninitialized here; they are expected
* to be filled in by spa_do_crypt_abd() below -- TODO confirm.
*/
BP_SET_CRYPT(bp, B_TRUE);
}
/* Perform the encryption. This should not fail */
VERIFY0(spa_do_crypt_abd(B_TRUE, spa, zio->io_bookmark.zb_objset, bp,
zio->io_txg, psize, zio->io_abd, eabd, iv, mac, salt, &no_crypt));
/* encode encryption metadata into the bp */
if (ot == DMU_OT_INTENT_LOG) {
/*
* ZIL blocks store the MAC in the embedded checksum, so the
* transform must always be applied.
*/
zio_crypt_encode_mac_zil(enc_buf, mac);
zio_push_transform(zio, eabd, psize, psize, NULL);
} else {
/*
* NOTE(review): this repeats the BP_SET_CRYPT() issued before
* the encryption call; it looks redundant but is kept as is.
*/
BP_SET_CRYPT(bp, B_TRUE);
zio_crypt_encode_params_bp(bp, salt, iv);
zio_crypt_encode_mac_bp(bp, mac);
if (no_crypt) {
/*
* The crypt routine reported that nothing was encrypted
* (only expected for dnode blocks), so the original
* io_abd is written as is and the scratch abd is dropped.
*/
ASSERT3U(ot, ==, DMU_OT_DNODE);
abd_free(eabd);
} else {
/* write the ciphertext in place of the original data */
zio_push_transform(zio, eabd, psize, psize, NULL);
}
}
return (ZIO_PIPELINE_CONTINUE);
}
/*
* ==========================================================================
* Generate and verify checksums
@@ -3523,8 +3827,8 @@ zio_checksum_verify(zio_t *zio)
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, zio, zio->io_offset,
zio->io_size, NULL, &info);
zio->io_vd, &zio->io_bookmark, zio,
zio->io_offset, zio->io_size, NULL, &info);
}
}
@@ -3824,7 +4128,7 @@ zio_done(zio_t *zio)
if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) {
if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
zio->io_vd, zio, 0, 0);
zio->io_vd, &zio->io_bookmark, zio, 0, 0);
}
if (zio->io_error) {
@@ -3837,7 +4141,7 @@ zio_done(zio_t *zio)
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
!vdev_is_dead(zio->io_vd))
zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
zio->io_vd, zio, 0, 0);
zio->io_vd, &zio->io_bookmark, zio, 0, 0);
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
@@ -3846,9 +4150,9 @@ zio_done(zio_t *zio)
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
*/
spa_log_error(zio->io_spa, zio);
spa_log_error(zio->io_spa, &zio->io_bookmark);
zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
NULL, zio, 0, 0);
NULL, &zio->io_bookmark, zio, 0, 0);
}
}
@@ -4046,6 +4350,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_free_bp_init,
zio_issue_async,
zio_write_compress,
zio_encrypt,
zio_checksum_generate,
zio_nop_write,
zio_ddt_read_start,
+56 -9
View File
@@ -308,6 +308,25 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
mutex_exit(&spa->spa_cksum_tmpls_lock);
}
/*
 * Adjust a freshly computed checksum so that it can share its storage with
 * an encryption MAC: words 2 and 3 of 'cksum' are overwritten with the
 * corresponding words of 'saved', leaving only words 0 and 1 as checksum.
 *
 * When 'xor' is set, words 2 and 3 are first folded into words 0 and 1.
 * Weak checksums do not spread their entropy evenly across all bits, so
 * folding the halves together avoids discarding entropy needlessly when
 * the checksum is truncated.
 */
static void
zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor)
{
	int w;

	/* process the (0, 2) word pair, then the (1, 3) word pair */
	for (w = 0; w < 2; w++) {
		if (xor)
			cksum->zc_word[w] ^= cksum->zc_word[w + 2];

		cksum->zc_word[w + 2] = saved->zc_word[w + 2];
	}
}
/*
* Generate the checksum.
*/
@@ -319,8 +338,9 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t cksum;
zio_cksum_t cksum, saved;
spa_t *spa = zio->io_spa;
boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0;
ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(ci->ci_func[0] != NULL);
@@ -331,6 +351,8 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
zio_eck_t eck;
size_t eck_offset;
bzero(&saved, sizeof (zio_cksum_t));
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t zilc;
abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
@@ -347,31 +369,36 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
if (checksum == ZIO_CHECKSUM_GANG_HEADER) {
zio_checksum_gang_verifier(&eck.zec_cksum, bp);
abd_copy_from_buf_off(abd, &eck.zec_cksum,
eck_offset + offsetof(zio_eck_t, zec_cksum),
sizeof (zio_cksum_t));
} else if (checksum == ZIO_CHECKSUM_LABEL) {
zio_checksum_label_verifier(&eck.zec_cksum, offset);
abd_copy_from_buf_off(abd, &eck.zec_cksum,
eck_offset + offsetof(zio_eck_t, zec_cksum),
sizeof (zio_cksum_t));
} else {
bp->blk_cksum = eck.zec_cksum;
saved = eck.zec_cksum;
eck.zec_cksum = bp->blk_cksum;
}
abd_copy_from_buf_off(abd, &zec_magic,
eck_offset + offsetof(zio_eck_t, zec_magic),
sizeof (zec_magic));
abd_copy_from_buf_off(abd, &eck.zec_cksum,
eck_offset + offsetof(zio_eck_t, zec_cksum),
sizeof (zio_cksum_t));
ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&cksum);
if (bp != NULL && BP_USES_CRYPT(bp) &&
BP_GET_TYPE(bp) != DMU_OT_OBJSET)
zio_checksum_handle_crypt(&cksum, &saved, insecure);
abd_copy_from_buf_off(abd, &cksum,
eck_offset + offsetof(zio_eck_t, zec_cksum),
sizeof (zio_cksum_t));
} else {
saved = bp->blk_cksum;
ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&bp->blk_cksum);
&cksum);
if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
zio_checksum_handle_crypt(&cksum, &saved, insecure);
bp->blk_cksum = cksum;
}
}
@@ -458,6 +485,26 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
/*
* MAC checksums are a special case since half of this checksum will
* actually be the encryption MAC. This will be verified by the
* decryption process, so we just check the truncated checksum now.
* Objset blocks use embedded MACs so we don't truncate the checksum
* for them.
*/
if (bp != NULL && BP_USES_CRYPT(bp) &&
BP_GET_TYPE(bp) != DMU_OT_OBJSET) {
if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) {
actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2];
actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3];
}
actual_cksum.zc_word[2] = 0;
actual_cksum.zc_word[3] = 0;
expected_cksum.zc_word[2] = 0;
expected_cksum.zc_word[3] = 0;
}
if (info != NULL) {
info->zbc_expected = expected_cksum;
info->zbc_actual = actual_cksum;
File diff suppressed because it is too large Load Diff
+10 -10
View File
@@ -451,7 +451,7 @@ zvol_set_volsize(const char *name, uint64_t volsize)
if (zv == NULL || zv->zv_objset == NULL) {
if (zv != NULL)
rw_exit(&zv->zv_suspend_lock);
if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
FTAG, &os)) != 0) {
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
@@ -478,7 +478,7 @@ out:
kmem_free(doi, sizeof (dmu_object_info_t));
if (owned) {
dmu_objset_disown(os, FTAG);
dmu_objset_disown(os, B_TRUE, FTAG);
if (zv != NULL)
zv->zv_objset = NULL;
} else {
@@ -1268,7 +1268,7 @@ zvol_first_open(zvol_state_t *zv)
}
/* lie and say we're read-only */
error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zv, &os);
error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, 1, zv, &os);
if (error)
goto out_mutex;
@@ -1277,7 +1277,7 @@ zvol_first_open(zvol_state_t *zv)
error = zvol_setup_zv(zv);
if (error) {
dmu_objset_disown(os, zv);
dmu_objset_disown(os, 1, zv);
zv->zv_objset = NULL;
}
@@ -1295,7 +1295,7 @@ zvol_last_close(zvol_state_t *zv)
zvol_shutdown_zv(zv);
dmu_objset_disown(zv->zv_objset, zv);
dmu_objset_disown(zv->zv_objset, 1, zv);
zv->zv_objset = NULL;
}
@@ -1756,7 +1756,7 @@ zvol_create_minor_impl(const char *name)
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
if (error)
goto out_doi;
@@ -1822,7 +1822,7 @@ zvol_create_minor_impl(const char *name)
zv->zv_objset = NULL;
out_dmu_objset_disown:
dmu_objset_disown(os, FTAG);
dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
kmem_free(doi, sizeof (dmu_object_info_t));
@@ -1887,11 +1887,11 @@ zvol_prefetch_minors_impl(void *arg)
char *dsname = job->name;
objset_t *os = NULL;
job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, FTAG,
&os);
job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
FTAG, &os);
if (job->error == 0) {
dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
dmu_objset_disown(os, FTAG);
dmu_objset_disown(os, B_TRUE, FTAG);
}
}
+3 -3
View File
@@ -210,14 +210,14 @@ zpios_dmu_setup(run_args_t *run_args)
t->start = zpios_timespec_now();
(void) snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id);
rc = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL);
rc = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL);
if (rc) {
zpios_print(run_args->file, "Error dmu_objset_create(%s, ...) "
"failed: %d\n", name, rc);
goto out;
}
rc = dmu_objset_own(name, DMU_OST_OTHER, 0, zpios_tag, &os);
rc = dmu_objset_own(name, DMU_OST_OTHER, 0, 1, zpios_tag, &os);
if (rc) {
zpios_print(run_args->file, "Error dmu_objset_own(%s, ...) "
"failed: %d\n", name, rc);
@@ -429,7 +429,7 @@ zpios_remove_objset(run_args_t *run_args)
}
}
dmu_objset_disown(run_args->os, zpios_tag);
dmu_objset_disown(run_args->os, 1, zpios_tag);
if (run_args->flags & DMU_REMOVE) {
rc = dsl_destroy_head(name);