diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index dec70c60c..fcf0e4779 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -33,7 +33,7 @@ * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, 2024, Klara Inc. * Copyright (c) 2023, Rob Norris */ @@ -3287,9 +3287,46 @@ fuid_table_destroy(void) } } +/* + * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on + * a live pool are normally cleaned up during ddt_sync(). We can't do that (and + * wouldn't want to anyway), but if we don't clean up, the presence of stuff on + * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves. + * + * Note that this is not a particularly efficient way to do this, but + * ddt_remove() is the only public method that can do the work we need, and it + * requires the right locks and etc to do the job. This is only ever called + * during zdb shutdown so efficiency is not especially important. 
+ */ +static void +zdb_ddt_cleanup(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + ddt_enter(ddt); + ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; + while (dde) { + next = AVL_NEXT(&ddt->ddt_tree, dde); + memset(&dde->dde_lead_zio, 0, + sizeof (dde->dde_lead_zio)); + ddt_remove(ddt, dde); + dde = next; + } + ddt_exit(ddt); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } +} + static void zdb_exit(int reason) { + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { @@ -5633,7 +5670,6 @@ static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { - uint64_t refcnt = 0; int i; ASSERT(type < ZDB_OT_TOTAL); @@ -5641,8 +5677,144 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; + /* + * This flag controls if we will issue a claim for the block while + * counting it, to ensure that all blocks are referenced in space maps. + * We don't issue claims if we're not doing leak tracking, because it's + * expensive if the user isn't interested. We also don't claim the + * second or later occurrences of cloned or dedup'd blocks, because we + * already claimed them the first time. + */ + boolean_t do_claim = !dump_opt['L']; + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + if (BP_GET_DEDUP(bp)) { + /* + * Dedup'd blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * We use the existing dedup system to track what we've seen. + * The first time we see a block, we do a ddt_lookup() to see + * if it exists in the DDT. If we're doing leak tracking, we + * claim the block at this time. 
+ * + * Each time we see a block, we reduce the refcount in the + * entry by one, and add to the size and count of dedup'd + * blocks to report at the end. + */ + + ddt_t *ddt = ddt_select(zcb->zcb_spa, bp); + + ddt_enter(ddt); + + /* + * Find the block. This will create the entry in memory, but + * we'll know if that happened by its refcount. + */ + ddt_entry_t *dde = ddt_lookup(ddt, bp); + + /* + * ddt_lookup() can only return NULL if this block didn't exist + * in the DDT and creating it would take the DDT over its + * quota. Since we got the block from disk, it must exist in + * the DDT, so this can't happen. + */ + VERIFY3P(dde, !=, NULL); + + /* Get the phys for this variant */ + ddt_phys_t *ddp = ddt_phys_select(dde, bp); + VERIFY3P(ddp, !=, NULL); + + /* + * This entry may have multiple sets of DVAs. We must claim + * each set the first time we see them in a real block on disk, + * or count them on subsequent occurrences. We don't have a + * convenient way to track the first time we see each variant, + * so we repurpose dde_lead_zio[] as a per-phys "seen" flag. We + * can do this safely in zdb because it never writes, so it + * will never have a writing zio for this block in that + * pointer. + */ + + /* + * Work out which dde_phys index was used, get the seen flag, + * and update it if necessary. + */ + uint_t idx = + ((uint_t)((uintptr_t)ddp - (uintptr_t)dde->dde_phys)) / + sizeof (ddt_phys_t); + VERIFY3P(ddp, ==, &dde->dde_phys[idx]); + boolean_t seen = (boolean_t)(uintptr_t)dde->dde_lead_zio[idx]; + if (!seen) + dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE; + + /* Consume a reference for this block. */ + VERIFY3U(ddt_phys_total_refcnt(dde), >, 0); + ddt_phys_decref(ddp); + + if (seen) { + /* + * The second or later time we see this block, + * it's a duplicate and we count it. + */ + zcb->zcb_dedup_asize += BP_GET_ASIZE(bp); + zcb->zcb_dedup_blocks++; + + /* Already claimed, don't do it again. 
*/ + do_claim = B_FALSE; + } + + ddt_exit(ddt); + } else if (zcb->zcb_brt_is_active && + brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if it's there, we note it and its refcount then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration. + */ + zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre == NULL) { + /* Not seen before; track it */ + uint64_t refcnt = + brt_entry_get_refcount(zcb->zcb_spa, bp); + if (refcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = refcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } else { + /* + * Second or later occurrence, count it and take a + * refcount. + */ + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + + /* Already claimed, don't do it again. */ + do_claim = B_FALSE; + } + } + for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; @@ -5745,71 +5917,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); - if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { - /* - * Cloned blocks are special. We need to count them, so we can - * later uncount them when reporting leaked space, and we must - * only claim them them once. 
- * - * To do this, we keep our own in-memory BRT. For each block - * we haven't seen before, we look it up in the real BRT and - * if its there, we note it and its refcount then proceed as - * normal. If we see the block again, we count it as a clone - * and then give it no further consideration. - */ - zdb_brt_entry_t zbre_search, *zbre; - avl_index_t where; - - zbre_search.zbre_dva = bp->blk_dva[0]; - zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); - if (zbre != NULL) { - zcb->zcb_clone_asize += BP_GET_ASIZE(bp); - zcb->zcb_clone_blocks++; - - zbre->zbre_refcount--; - if (zbre->zbre_refcount == 0) { - avl_remove(&zcb->zcb_brt, zbre); - umem_free(zbre, sizeof (zdb_brt_entry_t)); - } - return; - } - - uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); - if (crefcnt > 0) { - zbre = umem_zalloc(sizeof (zdb_brt_entry_t), - UMEM_NOFAIL); - zbre->zbre_dva = bp->blk_dva[0]; - zbre->zbre_refcount = crefcnt; - avl_insert(&zcb->zcb_brt, zbre, where); - } - } - - if (dump_opt['L']) + if (!do_claim) return; - if (BP_GET_DEDUP(bp)) { - ddt_t *ddt; - ddt_entry_t *dde; - - ddt = ddt_select(zcb->zcb_spa, bp); - ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_FALSE); - - if (dde == NULL) { - refcnt = 0; - } else { - ddt_phys_t *ddp = ddt_phys_select(dde, bp); - ddt_phys_decref(ddp); - refcnt = ddp->ddp_refcnt; - if (ddt_phys_total_refcnt(dde) == 0) - ddt_remove(ddt, dde); - } - ddt_exit(ddt); - } - - VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, - refcnt ? 
0 : spa_min_claim_txg(zcb->zcb_spa), - bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); + VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, + spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, + ZIO_FLAG_CANFAIL))); } static void @@ -6120,49 +6233,6 @@ zdb_load_obsolete_counts(vdev_t *vd) return (counts); } -static void -zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) -{ - ddt_bookmark_t ddb = {0}; - ddt_entry_t dde; - int error; - int p; - - ASSERT(!dump_opt['L']); - - while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { - blkptr_t blk; - ddt_phys_t *ddp = dde.dde_phys; - - if (ddb.ddb_class == DDT_CLASS_UNIQUE) - return; - - ASSERT(ddt_phys_total_refcnt(&dde) > 1); - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; - VERIFY(ddt); - - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(ddb.ddb_checksum, - &dde.dde_key, ddp, &blk); - if (p == DDT_PHYS_DITTO) { - zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); - } else { - zcb->zcb_dedup_asize += - BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); - zcb->zcb_dedup_blocks++; - } - } - - ddt_enter(ddt); - VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); - ddt_exit(ddt); - } - - ASSERT(error == ENOENT); -} - typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; @@ -6546,10 +6616,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - zdb_ddt_leak_init(spa, zcb); - spa_config_exit(spa, SCL_CONFIG, FTAG); } static boolean_t @@ -6814,6 +6880,8 @@ dump_block_stats(spa_t *spa) int e, c, err; bp_embedded_type_t i; + ddt_prefetch_all(spa); + zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { @@ -6938,7 +7006,6 @@ dump_block_stats(spa_t *spa) (u_longlong_t)total_alloc, (dump_opt['L']) ? 
"unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); - leaks = B_TRUE; } if (tzb->zb_count == 0) { @@ -8022,16 +8089,21 @@ dump_mos_leaks(spa_t *spa) mos_leak_vdev(spa->spa_root_vdev); - for (uint64_t class = 0; class < DDT_CLASSES; class++) { - for (uint64_t type = 0; type < DDT_TYPES; type++) { - for (uint64_t cksum = 0; - cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { - ddt_t *ddt = spa->spa_ddt[cksum]; - if (!ddt) - continue; + for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + /* DDT store objects */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { mos_obj_refd(ddt->ddt_object[type][class]); } } + + /* FDT container */ + mos_obj_refd(ddt->ddt_dir_object); } if (spa->spa_brt != NULL) { @@ -9624,6 +9696,9 @@ retry_lookup: } fini: + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 02d0cf5da..20bae8ce0 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -253,7 +253,7 @@ extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); extern void ddt_fini(void); -extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_prefetch_all(spa_t *spa); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 7e2010c42..84d7800cb 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -715,7 +715,7 @@ ddt_prefetch_all(spa_t *spa) static int ddt_configure(ddt_t *ddt, boolean_t new); ddt_entry_t * -ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +ddt_lookup(ddt_t *ddt, const blkptr_t *bp) { spa_t *spa = ddt->ddt_spa; ddt_key_t search; @@ -767,10 +767,6 @@ 
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) return (dde); } - /* Not found. */ - if (!add) - return (NULL); - /* Time to make a new entry. */ dde = ddt_alloc(&search); avl_insert(&ddt->ddt_tree, dde, where); @@ -1502,7 +1498,7 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ddt = ddt_select(spa, bp); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); /* Can be NULL if the entry for this block was pruned. */ if (dde == NULL) { diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 6d08d4bd1..5810e811a 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3518,7 +3518,7 @@ zio_ddt_write(zio_t *zio) ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); if (dde == NULL) { /* DDT size is over its quota so no new entries */ zp->zp_dedup = B_FALSE; @@ -3598,7 +3598,7 @@ zio_ddt_free(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); - freedde = dde = ddt_lookup(ddt, bp, B_TRUE); + freedde = dde = ddt_lookup(ddt, bp); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp)