From bee9cfb813f45f8458a8fd7584e526124d2e2d03 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Sat, 26 Aug 2023 11:34:43 -0700 Subject: [PATCH] Increase limit of redaction list by using spill block Currently redaction bookmarks and their associated redaction lists have a relatively low limit of 36 redaction snapshots. This is imposed by the number of snapshot GUIDs that fit in the bonus buffer of the redaction list object. While this is more than enough for most use cases, there are some limited cases where larger numbers would be useful to support. We tweak the redaction list creation code to use a spill block if the number of redaction snapshots is above the amount that would fit in the bonus buffer. We also make a small change to allow spill blocks to be use for types of data besides SA. In order to fully leverage this logic, we also change the redaction code to use vmem_alloc, to handle extremely large allocations if needed. Finally, small tweaks were made to the zfs commands and the test suite. Reviewed-by: Matthew Ahrens Signed-off-by: Paul Dagnelie Closes #15018 --- cmd/zdb/zdb.c | 15 ++++- cmd/zfs/zfs_main.c | 4 ++ include/sys/dsl_bookmark.h | 1 + include/zfeature_common.h | 1 + lib/libzfs/libzfs.abi | 9 +-- man/man7/zpool-features.7 | 12 ++++ module/zcommon/zfeature_common.c | 12 ++++ module/zfs/dmu_redact.c | 17 +++-- module/zfs/dnode.c | 1 + module/zfs/dsl_bookmark.c | 67 ++++++++++++++----- module/zfs/dsl_destroy.c | 10 +++ .../cli_root/zpool_get/zpool_get.cfg | 1 + .../redacted_send/redacted_many_clones.ksh | 12 ++-- 13 files changed, 128 insertions(+), 34 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 4b9921d47..87499cdc9 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -5293,8 +5293,18 @@ dump_one_objset(const char *dsname, void *arg) avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); - if (dbn->dbn_phys.zbm_redaction_obj != 0) - global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; + if (dbn->dbn_phys.zbm_redaction_obj != 0) { + global_feature_count[ + SPA_FEATURE_REDACTION_BOOKMARKS]++; + objset_t *mos = os->os_spa->spa_meta_objset; + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); + if (rl->dn_have_spill) { + global_feature_count[ + SPA_FEATURE_REDACTION_LIST_SPILL]++; + } + } if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; } @@ -8135,6 +8145,7 @@ dump_zpool(spa_t *spa) for (spa_feature_t f = 0; f < SPA_FEATURES; f++) global_feature_count[f] = UINT64_MAX; global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; + global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0; global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; global_feature_count[SPA_FEATURE_LIVELIST] = 0; diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 5ed25d1ea..ea73bb018 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -3978,6 +3978,10 @@ zfs_do_redact(int argc, char **argv) (void) fprintf(stderr, gettext("potentially invalid redaction " "snapshot; full dataset names required\n")); break; + case ESRCH: + (void) fprintf(stderr, gettext("attempted to resume redaction " + " with a mismatched redaction list\n")); + break; default: (void) fprintf(stderr, gettext("internal error: %s\n"), strerror(errno)); diff --git a/include/sys/dsl_bookmark.h b/include/sys/dsl_bookmark.h index 353c5c2d2..d4e559a09 100644 --- a/include/sys/dsl_bookmark.h +++ b/include/sys/dsl_bookmark.h @@ -72,6 +72,7 @@ typedef struct redaction_list_phys { typedef struct redaction_list { dmu_buf_user_t rl_dbu; redaction_list_phys_t *rl_phys; + dmu_buf_t *rl_bonus; dmu_buf_t *rl_dbuf; uint64_t rl_object; zfs_refcount_t rl_longholds; diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 7066c699e..1025c4473 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -80,6 +80,7 @@ typedef enum spa_feature { SPA_FEATURE_BLAKE3, SPA_FEATURE_BLOCK_CLONING, SPA_FEATURE_AVZ_V2, + SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 6e53bcb41..0a8e9bcbd 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -596,7 +596,7 @@ - + @@ -5809,7 +5809,8 @@ - + + @@ -8706,8 +8707,8 @@ - - + + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b901ce6c2..3c7b0b345 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -947,6 +947,18 @@ once all filesystems that have ever had their property set to .Sy zstd are destroyed. +. +.feature com.delphix redaction_list_spill no redaction_bookmarks +This feature enables the redaction list created by zfs redact to store +many more entries. +It becomes +.Sy active +when a redaction list is created with more than 36 entries, +and returns to being +.Sy enabled +when no long redaction lists remain in the pool. +For more information about redacted sends, see +.Xr zfs-send 8 . .El . .Sh SEE ALSO diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 4c9b7ed72..2c74d10f4 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -737,6 +737,18 @@ zpool_feature_init(void) ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t redact_list_spill_deps[] = { + SPA_FEATURE_REDACTION_BOOKMARKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_REDACTION_LIST_SPILL, + "com.delphix:redaction_list_spill", "redaction_list_spill", + "Support for increased number of redaction_snapshot " + "arguments in zfs redact.", 0, ZFEATURE_TYPE_BOOLEAN, + redact_list_spill_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 6bd35713f..5ac14edfc 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -746,7 +746,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, bqueue_enqueue(q, record, sizeof (*record)); return (0); } - redact_nodes = kmem_zalloc(num_threads * + redact_nodes = vmem_zalloc(num_threads * sizeof (*redact_nodes), KM_SLEEP); avl_create(&start_tree, redact_node_compare_start, @@ -820,7 +820,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, avl_destroy(&start_tree); avl_destroy(&end_tree); - kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); + vmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); @@ -1030,7 +1030,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, numsnaps = fnvlist_num_pairs(redactnvl); if (numsnaps > 0) - args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); + args = vmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); nvpair_t *pair = NULL; for (int i = 0; i < numsnaps; i++) { @@ -1079,7 +1079,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); if (args != NULL) - kmem_free(args, numsnaps * sizeof (*args)); + vmem_free(args, numsnaps * sizeof (*args)); return (SET_ERROR(ENAMETOOLONG)); } err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); @@ -1119,7 +1119,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, } else { uint64_t *guids = NULL; if (numsnaps > 0) { - guids = kmem_zalloc(numsnaps * sizeof (uint64_t), + guids = vmem_zalloc(numsnaps * sizeof (uint64_t), KM_SLEEP); } for (int i = 0; i < numsnaps; i++) { @@ -1131,10 +1131,9 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, dp = NULL; err = dsl_bookmark_create_redacted(newredactbook, snapname, numsnaps, guids, FTAG, &new_rl); - kmem_free(guids, numsnaps * sizeof (uint64_t)); - if (err != 0) { + vmem_free(guids, numsnaps * sizeof (uint64_t)); + if (err != 0) goto out; - } } for (int i = 0; i < numsnaps; i++) { @@ -1188,7 +1187,7 @@ out: } if (args != NULL) - kmem_free(args, numsnaps * sizeof (*args)); + vmem_free(args, numsnaps * sizeof (*args)); if (dp != NULL) dsl_pool_rele(dp, FTAG); if (ds != NULL) { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 7cf03264d..79fd02dcb 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -720,6 +720,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || + (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index e04796a08..03d9420db 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -34,6 +34,7 @@ #include #include #include +#include static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, @@ -459,25 +460,42 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps); if (redaction_list != NULL || bookmark_redacted) { redaction_list_t *local_rl; + boolean_t spill = B_FALSE; if (bookmark_redacted) { redact_snaps = dsredactsnaps; num_redact_snaps = dsnumsnaps; } + int bonuslen = sizeof (redaction_list_phys_t) + + num_redact_snaps * sizeof (uint64_t); + if (bonuslen > dmu_bonus_max()) + spill = B_TRUE; dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) + - num_redact_snaps * sizeof (uint64_t), tx); + DMU_OTN_UINT64_METADATA, spill ? 0 : bonuslen, tx); spa_feature_incr(dp->dp_spa, SPA_FEATURE_REDACTION_BOOKMARKS, tx); + if (spill) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_REDACTION_LIST_SPILL, tx); + } VERIFY0(dsl_redaction_list_hold_obj(dp, dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl)); dsl_redaction_list_long_hold(dp, local_rl, tag); - ASSERT3U((local_rl)->rl_dbuf->db_size, >=, - sizeof (redaction_list_phys_t) + num_redact_snaps * - sizeof (uint64_t)); - dmu_buf_will_dirty(local_rl->rl_dbuf, tx); + if (!spill) { + ASSERT3U(local_rl->rl_bonus->db_size, >=, bonuslen); + dmu_buf_will_dirty(local_rl->rl_bonus, tx); + } else { + dmu_buf_t *db; + VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus, + DB_RF_MUST_SUCCEED, FTAG, &db)); + dmu_buf_will_fill(db, tx); + VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen, + SPA_MINBLOCKSIZE), tx)); + local_rl->rl_phys = db->db_data; + local_rl->rl_dbuf = db; + } memcpy(local_rl->rl_phys->rlp_snaps, redact_snaps, sizeof (uint64_t) * num_redact_snaps); local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; @@ -636,11 +654,15 @@ dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx) SPA_FEATURE_REDACTION_BOOKMARKS)) return (SET_ERROR(ENOTSUP)); /* - * If the list of redact snaps will not fit in the bonus buffer with - * the furthest reached object and offset, fail. + * If the list of redact snaps will not fit in the bonus buffer (or + * spill block, with the REDACTION_LIST_SPILL feature) with the + * furthest reached object and offset, fail. */ - if (dbcra->dbcra_numsnaps > (dmu_bonus_max() - - sizeof (redaction_list_phys_t)) / sizeof (uint64_t)) + uint64_t snaplimit = ((spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_REDACTION_LIST_SPILL) ? spa_maxblocksize(dp->dp_spa) : + dmu_bonus_max()) - + sizeof (redaction_list_phys_t)) / sizeof (uint64_t); + if (dbcra->dbcra_numsnaps > snaplimit) return (SET_ERROR(E2BIG)); if (dsl_bookmark_create_nvl_validate_pair( @@ -1040,6 +1062,14 @@ dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name, } if (dbn->dbn_phys.zbm_redaction_obj != 0) { + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); + if (rl->dn_have_spill) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_LIST_SPILL, tx); + } + dnode_rele(rl, FTAG); VERIFY0(dmu_object_free(mos, dbn->dbn_phys.zbm_redaction_obj, tx)); spa_feature_decr(dmu_objset_spa(mos), @@ -1213,7 +1243,9 @@ redaction_list_evict_sync(void *rlu) void dsl_redaction_list_rele(redaction_list_t *rl, const void *tag) { - dmu_buf_rele(rl->rl_dbuf, tag); + if (rl->rl_bonus != rl->rl_dbuf) + dmu_buf_rele(rl->rl_dbuf, tag); + dmu_buf_rele(rl->rl_bonus, tag); } int @@ -1221,7 +1253,7 @@ dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag, redaction_list_t **rlp) { objset_t *mos = dp->dp_meta_objset; - dmu_buf_t *dbuf; + dmu_buf_t *dbuf, *spill_dbuf; redaction_list_t *rl; int err; @@ -1236,13 +1268,18 @@ dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag, redaction_list_t *winner = NULL; rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP); - rl->rl_dbuf = dbuf; + rl->rl_bonus = dbuf; + if (dmu_spill_hold_existing(dbuf, tag, &spill_dbuf) == 0) { + rl->rl_dbuf = spill_dbuf; + } else { + rl->rl_dbuf = dbuf; + } rl->rl_object = rlobj; - rl->rl_phys = dbuf->db_data; + rl->rl_phys = rl->rl_dbuf->db_data; rl->rl_mos = dp->dp_meta_objset; zfs_refcount_create(&rl->rl_longholds); dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL, - &rl->rl_dbuf); + &rl->rl_bonus); if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) { kmem_free(rl, sizeof (*rl)); rl = winner; diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 053f26878..d9d88a981 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -1125,6 +1125,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) { if (dbn->dbn_phys.zbm_redaction_obj != 0) { + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, + &rl)); + if (rl->dn_have_spill) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_LIST_SPILL, + tx); + } + dnode_rele(rl, FTAG); VERIFY0(dmu_object_free(mos, dbn->dbn_phys.zbm_redaction_obj, tx)); spa_feature_decr(dmu_objset_spa(mos), diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 160a0ca2e..4248578cd 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -86,6 +86,7 @@ typeset -a properties=( "feature@log_spacemap" "feature@device_rebuild" "feature@draid" + "feature@redaction_list_spill" ) if is_linux || is_freebsd; then diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh index 3386643b2..f2150be3b 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh @@ -27,7 +27,6 @@ # second (the last block in the file) is common to them all. # 2. Verify a redacted stream with a reasonable redaction list length can # be correctly processed. -# 3. Verify that if the list is too long, the send fails gracefully. # typeset ds_name="many_clones" @@ -56,13 +55,18 @@ for i in {1..64}; do log_must zfs snapshot ${clone}$i@snap done -# The limit isn't necessarily 32 snapshots. The maximum number of snapshots in +# The limit isn't necessarily 64 snapshots. The maximum number of snapshots in # the redacted list is determined in dsl_bookmark_create_redacted_check(). -log_must zfs redact $sendfs@snap book1 $clone{1..32}@snap +log_must zfs redact $sendfs@snap book1 $clone{1..64}@snap log_must eval "zfs send --redact book1 $sendfs@snap >$stream" log_must eval "zfs recv $recvfs <$stream" compare_files $sendfs $recvfs "f2" "$RANGE8" -log_mustnot zfs redact $sendfs@snap book2 $clone{1..64}@snap +rls_value="$(zpool get -H -o value feature@redaction_list_spill $POOL)" +if [ "$rls_value" = "active" ]; then + log_note "redaction_list_spill feature active" +else + log_fail "redaction_list_spill feature not active" +fi log_pass "Redacted send can deal with a large redaction list."