feature: large_microzap

In a4b21eadec we added the zap_micro_max_size tuneable to raise the size
at which "micro" (single-block) ZAPs are upgraded to "fat" (multi-block)
ZAPs. Before this, a microZAP was limited to 128KiB, which was the old
largest block size. The side effect of raising the max size past 128KiB
is that the microZAP must be stored in a large block, requiring the
large_blocks feature.

Unfortunately, this means that a backup stream created without the
--large-block (-L) flag to zfs send would split the microZAP block into
smaller blocks and send those, as is normal behaviour for large blocks.
This would be received correctly, but since microZAPs are limited to the
first block in the object by definition, the entries in the later blocks
would be inaccessible. For directory ZAPs, this gives the appearance of
files being lost.

This commit adds a feature flag, large_microzap, that must be enabled
for microZAPs to grow beyond 128KiB, and which will be activated the
first time that occurs. This feature is later checked when generating
the stream and if active, the send operation will abort unless
--large-block has also been requested.

Changing the limit still requires zap_micro_max_size to be changed. The
state of this flag effectively sets the upper value for this tuneable,
that is, if the feature is disabled, the tuneable will be clamped to
128KiB.

A stream flag is also added to ensure that the receiver also activates
its own feature flag upon receiving the stream. This is not strictly
necessary to _use_ the received microZAP, since it doesn't care how
large its block is, but it is required to send the microZAP object on,
otherwise the original problem occurs again.

Because it's difficult to reliably distinguish a microZAP from a fatZAP
from outside the ZAP code, and because it seems unlikely that most
users are affected (a fairly niche tuneable combined with what should be
an uncommon use of send), and for the sake of expediency, this change
activates the feature the first time a microZAP grows to use a large
block, and the feature is never deactivated after that. This can be
improved in the future.

This commit changes nothing for existing pools that already have large
microZAPs. The feature will not be retroactively applied, but will be
activated the next time a microZAP grows past the limit.

Don't use the large_blocks feature for enable/disable tests.  The
large_microzap feature depends on large_blocks, so large_blocks gets
enabled as a dependency, breaking the test. Instead use feature
"longname", which has the exact same feature characteristics.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16593
This commit is contained in:
Rob Norris 2024-10-03 13:47:11 +10:00 committed by GitHub
parent 412105977c
commit 224393a321
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 162 additions and 22 deletions

View File

@ -30,6 +30,7 @@
* Portions Copyright 2010 Robert Milkowski
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
* Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
* Copyright (c) 2024, Klara, Inc.
*/
#ifndef _SYS_FS_ZFS_H
@ -1631,6 +1632,7 @@ typedef enum {
ZFS_ERR_CRYPTO_NOTSUP,
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH,
ZFS_ERR_STREAM_LARGE_MICROZAP,
} zfs_errno_t;
/*

View File

@ -24,6 +24,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2024, Klara, Inc.
*/
#ifndef _SYS_ZAP_IMPL_H
@ -45,7 +46,6 @@ extern int fzap_default_block_shift;
#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE
#define ZAP_NEED_CD (-1U)
@ -210,6 +210,8 @@ int zap_hashbits(zap_t *zap);
uint32_t zap_maxcd(zap_t *zap);
uint64_t zap_getflags(zap_t *zap);
uint64_t zap_get_micro_max_size(spa_t *spa);
#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
void fzap_byteswap(void *buf, size_t size);

View File

@ -23,6 +23,7 @@
* Copyright (c) 2012, 2024 by Delphix. All rights reserved.
* Copyright 2016 RackTop Systems.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2024, Klara, Inc.
*/
#ifndef _SYS_ZFS_IOCTL_H
@ -145,6 +146,7 @@ typedef enum drr_headertype {
*/
#define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27)
#define DMU_BACKUP_FEATURE_LONGNAME (1 << 28)
#define DMU_BACKUP_FEATURE_LARGE_MICROZAP (1 << 29)
/*
* Mask of all supported backup features
@ -155,7 +157,8 @@ typedef enum drr_headertype {
DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \
DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \
DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \
DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_LONGNAME)
DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_LONGNAME | \
DMU_BACKUP_FEATURE_LARGE_MICROZAP)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))

View File

@ -24,6 +24,7 @@
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2024, Klara, Inc.
*/
#ifndef _ZFEATURE_COMMON_H
@ -84,6 +85,7 @@ typedef enum spa_feature {
SPA_FEATURE_RAIDZ_EXPANSION,
SPA_FEATURE_FAST_DEDUP,
SPA_FEATURE_LONGNAME,
SPA_FEATURE_LARGE_MICROZAP,
SPA_FEATURES
} spa_feature_t;

View File

@ -629,7 +629,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2408' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2464' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -6194,7 +6194,8 @@
<enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/>
<enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
<enumerator name='SPA_FEATURE_LONGNAME' value='42'/>
<enumerator name='SPA_FEATURES' value='43'/>
<enumerator name='SPA_FEATURE_LARGE_MICROZAP' value='43'/>
<enumerator name='SPA_FEATURES' value='44'/>
</enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
@ -9373,8 +9374,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='19264' id='bd39d632'>
<subrange length='43' type-id='7359adad' id='8f7e73a2'/>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='19712' id='fd4573e5'>
<subrange length='44' type-id='7359adad' id='cf8ba455'/>
</array-type-def>
<enum-decl name='zfeature_flags' id='6db816a4'>
<underlying-type type-id='9cac1fee'/>
@ -9451,7 +9452,7 @@
<pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
<qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
<pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
<var-decl name='spa_feature_table' type-id='bd39d632' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='spa_feature_table' type-id='fd4573e5' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
<function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>

View File

@ -30,6 +30,7 @@
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2019 Datto Inc.
* Copyright (c) 2024, Klara, Inc.
*/
#include <assert.h>
@ -2828,7 +2829,12 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd,
case EROFS:
zfs_error_aux(hdl, "%s", zfs_strerror(errno));
return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
case ZFS_ERR_STREAM_LARGE_MICROZAP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"source snapshot contains large microzaps, "
"need -L (--large-block) or -w (--raw) to "
"generate stream"));
return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
default:
return (zfs_standard_error(hdl, errno, errbuf));
}

View File

@ -16,7 +16,9 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.Dd June 27, 2024
.\" Copyright (c) 2024, Klara, Inc.
.\"
.Dd October 2, 2024
.Dt ZFS 4
.Os
.
@ -614,7 +616,11 @@ However, this is limited by
.
.It Sy zap_micro_max_size Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int
Maximum micro ZAP size.
A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
A "micro" ZAP is upgraded to a "fat" ZAP once it grows beyond the specified
size.
Sizes higher than 128KiB will be clamped to 128KiB unless the
.Sy large_microzap
feature is enabled.
.
.It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
If set, adjacent empty ZAP blocks will be collapsed, reducing disk space.

View File

@ -14,12 +14,11 @@
.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\" Copyright (c) 2019, Klara Inc.
.\" Copyright (c) 2019, 2023, 2024, Klara, Inc.
.\" Copyright (c) 2019, Allan Jude
.\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
.\" Copyright (c) 2023, Klara Inc.
.\"
.Dd February 14, 2024
.Dd October 2, 2024
.Dt ZPOOL-FEATURES 7
.Os
.
@ -706,6 +705,24 @@ are destroyed.
Large dnodes allow more data to be stored in the bonus buffer,
thus potentially improving performance by avoiding the use of spill blocks.
.
.feature com.klarasystems large_microzap yes extensible_dataset large_blocks
This feature allows "micro" ZAPs to grow larger than 128 KiB without being
upgraded to "fat" ZAPs.
.Pp
This feature becomes
.Sy active
the first time a micro ZAP grows larger than 128 KiB.
It will only be returned to the
.Sy enabled
state when all datasets that ever had a large micro ZAP are destroyed.
.Pp
Note that even when this feature is enabled, micro ZAPs cannot grow larger
than 128 KiB without also changing the
.Sy zap_micro_max_size
module parameter.
See
.Xr zfs 4 .
.
.feature com.delphix livelist yes extensible_dataset
This feature allows clones to be deleted faster than the traditional method
when a large number of random/sparse writes have been made to the clone.

View File

@ -28,8 +28,9 @@
.\" Copyright 2019 Richard Laager. All rights reserved.
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright 2019 Joyent, Inc.
.\" Copyright (c) 2024, Klara, Inc.
.\"
.Dd July 27, 2023
.Dd October 2, 2024
.Dt ZFS-SEND 8
.Os
.
@ -111,6 +112,9 @@ property of this filesystem has never been set above 128 KiB.
The receiving system must have the
.Sy large_blocks
pool feature enabled as well.
This flag is required if the
.Sy large_microzap
pool feature is active.
See
.Xr zpool-features 7
for details on ZFS feature flags and the

View File

@ -25,7 +25,7 @@
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, 2024, Klara, Inc.
* Copyright (c) 2019, Allan Jude
*/
@ -772,6 +772,19 @@ zpool_feature_init(void)
longname_deps, sfeatures);
}
{
static const spa_feature_t large_microzap_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_LARGE_BLOCKS,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_LARGE_MICROZAP,
"com.klarasystems:large_microzap", "large_microzap",
"Support for microzaps larger than 128KB.",
ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT,
ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures);
}
zfs_mod_list_supported_free(sfeatures);
}

View File

@ -25,7 +25,7 @@
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, 2024, Klara, Inc.
* Copyright (c) 2019, Allan Jude
* Copyright (c) 2019 Datto Inc.
* Copyright (c) 2022 Axcient.
@ -593,6 +593,9 @@ recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa)
if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
return (SET_ERROR(ENOTSUP));
if ((featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) &&
!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
return (SET_ERROR(ENOTSUP));
/*
* Receiving redacted streams requires that redacted datasets are
@ -994,6 +997,24 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
numredactsnaps, tx);
}
if (featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) {
/*
* The source has seen a large microzap at least once in its
* life, so we activate the feature here to match. It's not
* strictly necessary since a large microzap is usable without
* the feature active, but if that object is sent on from here,
* we need this info to know to add the stream feature.
*
* There may be no large microzap in the incoming stream, or
* ever again, but this is a very niche feature and it's very
* difficult to spot a large microzap in the stream, so it's
* not worth the effort of trying harder to activate the
* feature at first use.
*/
dsl_dataset_activate_feature(dsobj, SPA_FEATURE_LARGE_MICROZAP,
(void *)B_TRUE, tx);
}
dmu_buf_will_dirty(newds->ds_dbuf, tx);
dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;

View File

@ -26,7 +26,7 @@
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright 2016 RackTop Systems.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, 2024, Klara, Inc.
* Copyright (c) 2019, Allan Jude
*/
@ -2015,6 +2015,17 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LONGNAME)) {
*featureflags |= DMU_BACKUP_FEATURE_LONGNAME;
}
if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_MICROZAP)) {
/*
* We must never split a large microzap block, so we can only
* send large microzaps if LARGE_BLOCKS is already enabled.
*/
if (!(*featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS))
return (SET_ERROR(ZFS_ERR_STREAM_LARGE_MICROZAP));
*featureflags |= DMU_BACKUP_FEATURE_LARGE_MICROZAP;
}
return (0);
}

View File

@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2024, Klara, Inc.
*/
#include <sys/dmu.h>
@ -575,7 +576,6 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
dmu_tx_t *tx = txh->txh_tx;
dnode_t *dn = txh->txh_dnode;
int err;
extern int zap_micro_max_size;
ASSERT(tx->tx_txg == 0);
@ -591,7 +591,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
* - 2 grown ptrtbl blocks
*/
(void) zfs_refcount_add_many(&txh->txh_space_towrite,
zap_micro_max_size, FTAG);
zap_get_micro_max_size(tx->tx_pool->dp_spa), FTAG);
if (dn == NULL)
return;

View File

@ -24,6 +24,7 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2024, Klara, Inc.
*/
#include <sys/zio.h>
@ -36,12 +37,37 @@
#include <sys/btree.h>
#include <sys/arc.h>
#include <sys/dmu_objset.h>
#include <sys/spa_impl.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
#endif
int zap_micro_max_size = MZAP_MAX_BLKSZ;
/*
* The maximum size (in bytes) of a microzap before it is converted to a
* fatzap. It will be rounded up to the next multiple of 512
* (SPA_MINBLOCKSIZE).
*
* By definition, a microzap must fit into a single block, so this has
* traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
* Setting this higher requires both the large_blocks feature (to even create
* blocks that large) and the large_microzap feature (to enable the stream
* machinery to understand not to try to split a microzap block).
*
* If large_microzap is enabled, this value will be clamped to
* spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE.
*/
static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
/*
 * Return the effective microzap size limit (in bytes) for the given pool.
 *
 * The zap_micro_max_size tuneable is first rounded up to a multiple of
 * SPA_MINBLOCKSIZE. A result at or below SPA_OLD_MAXBLOCKSIZE is returned
 * unchanged. A larger result is honoured only when the large_microzap
 * feature is enabled on the pool, and is then capped at spa_maxblocksize();
 * without the feature the limit falls back to SPA_OLD_MAXBLOCKSIZE.
 */
uint64_t
zap_get_micro_max_size(spa_t *spa)
{
	uint64_t limit = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE);

	if (limit > SPA_OLD_MAXBLOCKSIZE) {
		/* Going past the old maximum needs the feature flag. */
		if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
			return (SPA_OLD_MAXBLOCKSIZE);
		return (MIN(limit, spa_maxblocksize(spa)));
	}

	return (limit);
}
static int mzap_upgrade(zap_t **zapp,
const void *tag, dmu_tx_t *tx, zap_flags_t flags);
@ -638,7 +664,7 @@ zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
if (zap->zap_ismicro && tx && adding &&
zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
if (newsz > zap_micro_max_size) {
if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
dprintf("upgrading obj %llu: num_entries=%u\n",
(u_longlong_t)obj, zap->zap_m.zap_num_entries);
*zapp = zap;
@ -650,6 +676,31 @@ zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
zap->zap_m.zap_num_chunks =
db->db_size / MZAP_ENT_LEN - 1;
if (newsz > SPA_OLD_MAXBLOCKSIZE) {
dsl_dataset_t *ds = dmu_objset_ds(os);
if (!dsl_dataset_feature_is_active(ds,
SPA_FEATURE_LARGE_MICROZAP)) {
/*
* A microzap just grew beyond the old limit
* for the first time, so we have to ensure the
* feature flag is activated.
* zap_get_micro_max_size() won't let us get
* here if the feature is not enabled, so we
* don't need any other checks beforehand.
*
* Since we're in open context, we can't
* activate the feature directly, so we instead
* flag it on the dataset for next sync.
*/
dsl_dataset_dirty(ds, tx);
mutex_enter(&ds->ds_lock);
ds->ds_feature_activation
[SPA_FEATURE_LARGE_MICROZAP] =
(void *)B_TRUE;
mutex_exit(&ds->ds_lock);
}
}
}
*zapp = zap;

View File

@ -75,8 +75,8 @@ log_onexit cleanup
# excluded because other features depend on them.
set -A features \
"hole_birth" \
"large_blocks" \
"large_dnode" \
"longname" \
"userobj_accounting"
typeset -i i=0

View File

@ -111,5 +111,6 @@ if is_linux || is_freebsd; then
"feature@raidz_expansion"
"feature@fast_dedup"
"feature@longname"
"feature@large_microzap"
)
fi