diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index cd2496bf7..1676020d0 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -30,6 +30,7 @@ * Portions Copyright 2010 Robert Milkowski * Copyright (c) 2021, Colm Buckley * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024, Klara, Inc. */ #ifndef _SYS_FS_ZFS_H @@ -1631,6 +1632,7 @@ typedef enum { ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, + ZFS_ERR_STREAM_LARGE_MICROZAP, } zfs_errno_t; /* diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 0c72c6881..fad2c8bfa 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -24,6 +24,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2024, Klara, Inc. */ #ifndef _SYS_ZAP_IMPL_H @@ -45,7 +46,6 @@ extern int fzap_default_block_shift; #define MZAP_ENT_LEN 64 #define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) -#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE #define ZAP_NEED_CD (-1U) @@ -210,6 +210,8 @@ int zap_hashbits(zap_t *zap); uint32_t zap_maxcd(zap_t *zap); uint64_t zap_getflags(zap_t *zap); +uint64_t zap_get_micro_max_size(spa_t *spa); + #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) void fzap_byteswap(void *buf, size_t size); diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 470b2ed5f..aa20e52a7 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -23,6 +23,7 @@ * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara, Inc. 
*/ #ifndef _SYS_ZFS_IOCTL_H @@ -145,6 +146,7 @@ typedef enum drr_headertype { */ #define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) #define DMU_BACKUP_FEATURE_LONGNAME (1 << 28) +#define DMU_BACKUP_FEATURE_LARGE_MICROZAP (1 << 29) /* * Mask of all supported backup features @@ -155,7 +157,8 @@ typedef enum drr_headertype { DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \ - DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_LONGNAME) + DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_LONGNAME | \ + DMU_BACKUP_FEATURE_LARGE_MICROZAP) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 1ca122d30..ac42b5c0c 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -24,6 +24,7 @@ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara, Inc. */ #ifndef _ZFEATURE_COMMON_H @@ -84,6 +85,7 @@ typedef enum spa_feature { SPA_FEATURE_RAIDZ_EXPANSION, SPA_FEATURE_FAST_DEDUP, SPA_FEATURE_LONGNAME, + SPA_FEATURE_LARGE_MICROZAP, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 782192eb8..1a96460c2 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -629,7 +629,7 @@ - + @@ -6194,7 +6194,8 @@ - + + @@ -9373,8 +9374,8 @@ - - + + @@ -9451,7 +9452,7 @@ - + diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index ee01ee9b2..b9780720e 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -30,6 +30,7 @@ * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019 Datto Inc. 
+ * Copyright (c) 2024, Klara, Inc. */ #include @@ -2828,7 +2829,12 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, case EROFS: zfs_error_aux(hdl, "%s", zfs_strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); - + case ZFS_ERR_STREAM_LARGE_MICROZAP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "source snapshot contains large microzaps, " + "need -L (--large-block) or -w (--raw) to " + "generate stream")); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5a47cbbe2..cf6720317 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -16,7 +16,9 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd June 27, 2024 +.\" Copyright (c) 2024, Klara, Inc. +.\" +.Dd October 2, 2024 .Dt ZFS 4 .Os . @@ -614,7 +616,11 @@ However, this is limited by . .It Sy zap_micro_max_size Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int Maximum micro ZAP size. -A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. +A "micro" ZAP is upgraded to a "fat" ZAP once it grows beyond the specified +size. +Sizes higher than 128 KiB will be clamped to 128 KiB unless the +.Sy large_microzap +feature is enabled. . .It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int If set, adjacent empty ZAP blocks will be collapsed, reducing disk space. diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index ad9755ba5..7b392a896 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -14,12 +14,11 @@ .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] -.\" Copyright (c) 2019, Klara Inc. +.\" Copyright (c) 2019, 2023, 2024, Klara, Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley -.\" Copyright (c) 2023, Klara Inc. 
.\" -.Dd February 14, 2024 +.Dd October 2, 2024 .Dt ZPOOL-FEATURES 7 .Os . @@ -706,6 +705,24 @@ are destroyed. Large dnodes allow more data to be stored in the bonus buffer, thus potentially improving performance by avoiding the use of spill blocks. . +.feature com.klarasystems large_microzap yes extensible_dataset large_blocks +This feature allows "micro" ZAPs to grow larger than 128 KiB without being +upgraded to "fat" ZAPs. +.Pp +This feature becomes +.Sy active +the first time a micro ZAP grows larger than 128 KiB. +It will only be returned to the +.Sy enabled +state when all datasets that ever had a large micro ZAP are destroyed. +.Pp +Note that even when this feature is enabled, micro ZAPs cannot grow larger +than 128 KiB without also changing the +.Sy zap_micro_max_size +module parameter. +See +.Xr zfs 4 . +. .feature com.delphix livelist yes extensible_dataset This feature allows clones to be deleted faster than the traditional method when a large number of random/sparse writes have been made to the clone. diff --git a/man/man8/zfs-send.8 b/man/man8/zfs-send.8 index ba604bf77..877d95414 100644 --- a/man/man8/zfs-send.8 +++ b/man/man8/zfs-send.8 @@ -28,8 +28,9 @@ .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. +.\" Copyright (c) 2024, Klara, Inc. .\" -.Dd July 27, 2023 +.Dd October 2, 2024 .Dt ZFS-SEND 8 .Os . @@ -111,6 +112,9 @@ property of this filesystem has never been set above 128 KiB. The receiving system must have the .Sy large_blocks pool feature enabled as well. +This flag is required if the +.Sy large_microzap +pool feature is active. See .Xr zpool-features 7 for details on ZFS feature flags and the diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 881deb5bf..96f0086d7 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -25,7 +25,7 @@ * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude */ @@ -772,6 +772,19 @@ zpool_feature_init(void) longname_deps, sfeatures); } + { + static const spa_feature_t large_microzap_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_LARGE_BLOCKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_MICROZAP, + "com.klarasystems:large_microzap", "large_microzap", + "Support for microzaps larger than 128KB.", + ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, + ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 4877eb7e6..b1cd981ce 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -25,7 +25,7 @@ * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019 Datto Inc. * Copyright (c) 2022 Axcient. @@ -593,6 +593,9 @@ recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa) if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) + return (SET_ERROR(ENOTSUP)); /* * Receiving redacted streams requires that redacted datasets are @@ -994,6 +997,24 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) numredactsnaps, tx); } + if (featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) { + /* + * The source has seen a large microzap at least once in its + * life, so we activate the feature here to match. 
It's not + * strictly necessary since a large microzap is usable without + * the feature active, but if that object is sent on from here, + * we need this info to know to add the stream feature. + * + * There may be no large microzap in the incoming stream, or + * ever again, but this is a very niche feature and it's very + * difficult to spot a large microzap in the stream, so it's + * not worth the effort of trying harder to activate the + * feature at first use. + */ + dsl_dataset_activate_feature(dsobj, SPA_FEATURE_LARGE_MICROZAP, + (void *)B_TRUE, tx); + } + dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index c7d3a5cb6..a174972e9 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -26,7 +26,7 @@ * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2016 Actifio, Inc. All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude */ @@ -2015,6 +2015,17 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LONGNAME)) { *featureflags |= DMU_BACKUP_FEATURE_LONGNAME; } + + if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_MICROZAP)) { + /* + * We must never split a large microzap block, so we can only + * send large microzaps if LARGE_BLOCKS is already enabled. + */ + if (!(*featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ZFS_ERR_STREAM_LARGE_MICROZAP)); + *featureflags |= DMU_BACKUP_FEATURE_LARGE_MICROZAP; + } + return (0); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 2c2a6c764..3fdcebdff 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
* Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. */ #include @@ -575,7 +576,6 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; int err; - extern int zap_micro_max_size; ASSERT(tx->tx_txg == 0); @@ -591,7 +591,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) * - 2 grown ptrtbl blocks */ (void) zfs_refcount_add_many(&txh->txh_space_towrite, - zap_micro_max_size, FTAG); + zap_get_micro_max_size(tx->tx_pool->dp_spa), FTAG); if (dn == NULL) return; diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index a428a040a..12938022e 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -24,6 +24,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2024, Klara, Inc. */ #include @@ -36,12 +37,37 @@ #include #include #include +#include #ifdef _KERNEL #include #endif -int zap_micro_max_size = MZAP_MAX_BLKSZ; +/* + * The maximum size (in bytes) of a microzap before it is converted to a + * fatzap. It will be rounded up to the next multiple of 512 (SPA_MINBLOCKSIZE). + * + * By definition, a microzap must fit into a single block, so this has + * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default. + * Setting this higher requires both the large_blocks feature (to even create + * blocks that large) and the large_microzap feature (to enable the stream + * machinery to understand not to try to split a microzap block). + * + * If large_microzap is enabled, this value will be clamped to + * spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE. 
+ */ +static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE; + +uint64_t +zap_get_micro_max_size(spa_t *spa) +{ + uint64_t maxsz = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE); + if (maxsz <= SPA_OLD_MAXBLOCKSIZE) + return (maxsz); + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) + return (MIN(maxsz, spa_maxblocksize(spa))); + return (SPA_OLD_MAXBLOCKSIZE); +} static int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags); @@ -638,7 +664,7 @@ zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > zap_micro_max_size) { + if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) { dprintf("upgrading obj %llu: num_entries=%u\n", (u_longlong_t)obj, zap->zap_m.zap_num_entries); *zapp = zap; @@ -650,6 +676,31 @@ zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; + + if (newsz > SPA_OLD_MAXBLOCKSIZE) { + dsl_dataset_t *ds = dmu_objset_ds(os); + if (!dsl_dataset_feature_is_active(ds, + SPA_FEATURE_LARGE_MICROZAP)) { + /* + * A microzap just grew beyond the old limit + * for the first time, so we have to ensure the + * feature flag is activated. + * zap_get_micro_max_size() won't let us get + * here if the feature is not enabled, so we + * don't need any other checks beforehand. + * + * Since we're in open context, we can't + * activate the feature directly, so we instead + * flag it on the dataset for next sync. 
+ */ + dsl_dataset_dirty(ds, tx); + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation + [SPA_FEATURE_LARGE_MICROZAP] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + } + } } *zapp = zap; diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh index 4b0618017..e10d2936c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh @@ -75,8 +75,8 @@ log_onexit cleanup # excluded because other features depend on them. set -A features \ "hole_birth" \ - "large_blocks" \ "large_dnode" \ + "longname" \ "userobj_accounting" typeset -i i=0 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index b5bc46dce..e1fe865b1 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -111,5 +111,6 @@ if is_linux || is_freebsd; then "feature@raidz_expansion" "feature@fast_dedup" "feature@longname" + "feature@large_microzap" ) fi