diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 134c258a1..70096b809 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -107,7 +107,9 @@ extern uint_t zfs_reconstruct_indirect_combinations_max;
 extern uint_t zfs_btree_verify_intensity;
 
 static const char cmdname[] = "zdb";
-uint8_t dump_opt[256];
+uint8_t dump_opt[512];
+
+#define ALLOCATED_OPT 256	/* val of a long-only option, > any char */
 
 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
@@ -1666,6 +1668,16 @@ dump_metaslab_stats(metaslab_t *msp)
 	dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
+static void
+dump_allocated(void *arg, uint64_t start, uint64_t size)
+{
+	uint64_t *off = arg;	/* end of the previous free segment */
+	if (*off != start)	/* gap between free segments == allocated */
+		(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", *off,
+		    start - *off);
+	*off = start + size;
+}
+
 static void
 dump_metaslab(metaslab_t *msp)
 {
@@ -1682,13 +1694,24 @@ dump_metaslab(metaslab_t *msp)
 	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
 	    (u_longlong_t)space_map_object(sm), freebuf);
 
-	if (dump_opt['m'] > 2 && !dump_opt['L']) {
+	if (dump_opt[ALLOCATED_OPT] ||
+	    (dump_opt['m'] > 2 && !dump_opt['L'])) {
 		mutex_enter(&msp->ms_lock);
 		VERIFY0(metaslab_load(msp));
+	}
+
+	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		zfs_range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
-		metaslab_unload(msp);
-		mutex_exit(&msp->ms_lock);
+	}
+
+	if (dump_opt[ALLOCATED_OPT]) {
+		uint64_t off = msp->ms_start;	/* starts at ms_start, not 0 */
+		zfs_range_tree_walk(msp->ms_allocatable, dump_allocated,
+		    &off);
+		if (off != msp->ms_start + msp->ms_size)
+			(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off,
+			    msp->ms_start + msp->ms_size - off);
 	}
 
 	if (dump_opt['m'] > 1 && sm != NULL &&
@@ -1703,6 +1726,12 @@ dump_metaslab(metaslab_t *msp)
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}
 
+	if (dump_opt[ALLOCATED_OPT] ||
+	    (dump_opt['m'] > 2 && !dump_opt['L'])) {
+		metaslab_unload(msp);
+		mutex_exit(&msp->ms_lock);
+	}
+
 	if (vd->vdev_ops == &vdev_draid_ops)
 		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
 	else
@@ -1739,8 +1768,9 @@
print_vdev_metaslab_header(vdev_t *vd)
 		}
 	}
 
-	(void) printf("\tvdev %10llu %s",
-	    (u_longlong_t)vd->vdev_id, bias_str);
+	(void) printf("\tvdev %10llu\t%s metaslab shift %4llu",
+	    (u_longlong_t)vd->vdev_id, bias_str,
+	    (u_longlong_t)vd->vdev_ms_shift);
 
 	if (ms_flush_data_obj != 0) {
 		(void) printf(" ms_unflushed_phys object %llu",
@@ -9375,6 +9405,8 @@ main(int argc, char **argv)
 		{"all-reconstruction", no_argument, NULL, 'Y'},
 		{"livelist", no_argument, NULL, 'y'},
 		{"zstd-headers", no_argument, NULL, 'Z'},
+		{"allocated-map", no_argument, NULL,
+		    ALLOCATED_OPT},
 		{0, 0, 0, 0}
 	};
 
@@ -9405,6 +9437,7 @@ main(int argc, char **argv)
 		case 'u':
 		case 'y':
 		case 'Z':
+		case ALLOCATED_OPT:
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
diff --git a/cmd/zdb/zdb.h b/cmd/zdb/zdb.h
index 6b6c91698..48b561eb2 100644
--- a/cmd/zdb/zdb.h
+++ b/cmd/zdb/zdb.h
@@ -29,6 +29,6 @@
 #define _ZDB_H
 
 void dump_intent_log(zilog_t *);
-extern uint8_t dump_opt[256];
+extern uint8_t dump_opt[512];
 
 #endif /* _ZDB_H */
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 62e290cd1..3d91fb28a 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -48,8 +48,6 @@
 
 #include "zdb.h"
 
-extern uint8_t dump_opt[256];
-
 static char tab_prefix[4] = "\t\t\t";
 
 static void
diff --git a/cmd/zhack.c b/cmd/zhack.c
index 536532a67..fe1c697e1 100644
--- a/cmd/zhack.c
+++ b/cmd/zhack.c
@@ -54,6 +54,7 @@
 #include
 #include
 #include
+#include
 
 static importargs_t g_importargs;
 static char *g_pool;
@@ -93,7 +94,10 @@ usage(void)
 	    " -c repair corrupted label checksums\n"
 	    " -u restore the label on a detached device\n"
 	    "\n"
-	    " : path to vdev\n");
+	    " : path to vdev\n"
+	    "\n"
+	    " metaslab leak [-f] <pool>\n"
+	    " apply allocation map from zdb to specified pool\n");
 	exit(1);
 }
 
@@ -500,6 +504,238 @@ zhack_do_feature(int argc, char **argv)
 	return (0);
 }
 
+/*
+ * Return B_TRUE when string "a" begins with the prefix "b".
+ */
+static boolean_t
+strstarts(const char *a, const char *b)
+{
+	return (strncmp(a, b, strlen(b)) == 0);
+}
+
+/*
+ * Force the free space inside [start, start + size) of a metaslab to be
+ * treated as allocated in the txg of "tx".  zfs_range_tree_find_in() is
+ * asked for one free piece of the remaining range at a time; each piece
+ * found is moved from ms_allocatable to that txg's ms_allocating tree and
+ * the scan resumes after it, stopping once nothing more is found.
+ *
+ * The caller must have disabled the metaslab and must hold ms_lock.
+ */
+static void
+metaslab_force_alloc(metaslab_t *msp, uint64_t start, uint64_t size,
+    dmu_tx_t *tx)
+{
+	ASSERT(msp->ms_disabled);
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	uint64_t txg = dmu_tx_get_txg(tx);
+
+	uint64_t off = start;
+	while (off < start + size) {
+		uint64_t ostart, osize;
+		boolean_t found = zfs_range_tree_find_in(msp->ms_allocatable,
+		    off, start + size - off, &ostart, &osize);
+		if (!found)
+			break;
+		zfs_range_tree_remove(msp->ms_allocatable, ostart, osize);
+
+		/* Dirty the vdev before the first addition in this txg. */
+		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
+			vdev_dirty(msp->ms_group->mg_vd, VDD_METASLAB, msp,
+			    txg);
+
+		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], ostart,
+		    osize);
+		msp->ms_allocating_total += osize;
+		off = ostart + osize;
+	}
+}
+
+/*
+ * zhack metaslab leak [-f] <pool>
+ *
+ * Replay an allocation map on a pool.  The map is read from stdin, one
+ * line at a time; only these lines are acted on, all others are skipped:
+ *
+ *   "\tvdev <id> ... metaslab shift <n>"   select the top-level vdev
+ *   "\tmetaslabs <count>"                  compare with vdev_ms_count
+ *   "ALLOC: <start> <size>"                force-allocate the region
+ *
+ * A malformed line of one of these kinds trips a VERIFY.  A map that does
+ * not fit the pool (unknown vdev, different metaslab shift or count, region
+ * past the last metaslab) is reported on stderr, stops processing and
+ * makes the function return 1; otherwise 0 is returned.  -f skips the
+ * metaslab count comparison.
+ */
+static int
+zhack_do_metaslab_leak(int argc, char **argv)
+{
+	int c;
+	int rv = 0;
+	char *target;
+	spa_t *spa;
+
+	optind = 1;
+	boolean_t force = B_FALSE;
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, "error: missing pool name\n");
+		usage();
+	}
+	target = argv[0];
+
+	zhack_spa_open(target, B_FALSE, FTAG, &spa);
+	spa_config_enter(spa, SCL_VDEV | SCL_ALLOC, FTAG, RW_READER);
+
+	char *line = NULL;
+	size_t cap = 0;
+
+	vdev_t *vd = NULL;
+	metaslab_t *prev = NULL;
+	dmu_tx_t *tx = NULL;
+	while (getline(&line, &cap, stdin) > 0) {
+		if (strstarts(line, "\tvdev ")) {
+			uint64_t vdev_id, ms_shift;
+			/*
+			 * The first pattern skips one word between the id
+			 * and "metaslab shift".  If that does not yield
+			 * both numbers, retry assuming the word is absent.
+			 */
+			if (sscanf(line,
+			    "\tvdev %10"PRIu64"\t%*s metaslab shift %4"PRIu64,
+			    &vdev_id, &ms_shift) != 2) {
+				VERIFY3U(sscanf(line, "\tvdev %"PRIu64
+				    "\t metaslab shift %4"PRIu64,
+				    &vdev_id, &ms_shift), ==, 2);
+			}
+			vd = vdev_lookup_top(spa, vdev_id);
+			if (vd == NULL) {
+				fprintf(stderr, "error: no such vdev with "
+				    "id %"PRIu64"\n", vdev_id);
+				rv = 1;
+				break;
+			}
+			if (tx) {
+				dmu_tx_commit(tx);
+				mutex_exit(&prev->ms_lock);
+				metaslab_enable(prev, B_FALSE, B_FALSE);
+				tx = NULL;
+				prev = NULL;
+			}
+			if (vd->vdev_ms_shift != ms_shift) {
+				fprintf(stderr, "error: ms_shift mismatch: %"
+				    PRIu64" != %"PRIu64"\n", vd->vdev_ms_shift,
+				    ms_shift);
+				rv = 1;
+				break;
+			}
+		} else if (strstarts(line, "\tmetaslabs ")) {
+			uint64_t ms_count;
+			VERIFY3U(sscanf(line, "\tmetaslabs %"PRIu64, &ms_count),
+			    ==, 1);
+			/* ASSERT() is debug-only; the input is external. */
+			VERIFY3P(vd, !=, NULL);
+			if (!force && vd->vdev_ms_count != ms_count) {
+				fprintf(stderr, "error: ms_count mismatch: %"
+				    PRIu64" != %"PRIu64"\n", vd->vdev_ms_count,
+				    ms_count);
+				rv = 1;
+				break;
+			}
+		} else if (strstarts(line, "ALLOC:")) {
+			uint64_t start, size;
+			VERIFY3U(sscanf(line, "ALLOC: %"PRIu64" %"PRIu64"\n",
+			    &start, &size), ==, 2);
+
+			VERIFY3P(vd, !=, NULL);
+			/*
+			 * With -f the map may describe more metaslabs than
+			 * this vdev has, so bound the index before using it.
+			 */
+			uint64_t ms_id = start >> vd->vdev_ms_shift;
+			if (ms_id >= vd->vdev_ms_count) {
+				fprintf(stderr, "error: offset %"PRIu64
+				    " is beyond the last metaslab\n", start);
+				rv = 1;
+				break;
+			}
+			metaslab_t *cur = vd->vdev_ms[ms_id];
+			if (prev != cur) {
+				if (prev) {
+					dmu_tx_commit(tx);
+					mutex_exit(&prev->ms_lock);
+					metaslab_enable(prev, B_FALSE, B_FALSE);
+				}
+				ASSERT(cur);
+				metaslab_disable(cur);
+				mutex_enter(&cur->ms_lock);
+				VERIFY0(metaslab_load(cur));
+				prev = cur;
+				tx = dmu_tx_create_dd(
+				    spa_get_dsl(vd->vdev_spa)->dp_root_dir);
+				VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
+			}
+
+			metaslab_force_alloc(cur, start, size, tx);
+		} else {
+			continue;
+		}
+	}
+	if (tx) {
+		dmu_tx_commit(tx);
+		mutex_exit(&prev->ms_lock);
+		metaslab_enable(prev, B_FALSE, B_FALSE);
+		tx = NULL;
+		prev = NULL;
+	}
+	free(line);
+
+	spa_config_exit(spa, SCL_VDEV | SCL_ALLOC, FTAG);
+	spa_close(spa, FTAG);
+	return (rv);
+}
+
+/*
+ * Dispatch "zhack metaslab <operation>"; "leak" is the only operation.
+ */
+static int
+zhack_do_metaslab(int argc, char **argv)
+{
+	char *subcommand;
+
+	argc--;
+	argv++;
+	if (argc == 0) {
+		(void) fprintf(stderr,
+		    "error: no metaslab operation specified\n");
+		usage();
+	}
+
+	subcommand = argv[0];
+	if (strcmp(subcommand, "leak") == 0) {
+		return (zhack_do_metaslab_leak(argc, argv));
+	} else {
+		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
+		    subcommand);
+		usage();
+	}
+
+	return (0);
+}
+
 #define ASHIFT_UBERBLOCK_SHIFT(ashift) \
 	MIN(MAX(ashift, UBERBLOCK_SHIFT), \
 	MAX_UBERBLOCK_SHIFT)
@@ -1015,6 +1251,8 @@ main(int argc, char **argv)
 		rv = zhack_do_feature(argc, argv);
 	} else if (strcmp(subcommand, "label") == 0) {
 		return
(zhack_do_label(argc, argv));
+	} else if (strcmp(subcommand, "metaslab") == 0) {
+		rv = zhack_do_metaslab(argc, argv);
 	} else {
 		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
 		    subcommand);
diff --git a/contrib/pyzfs/libzfs_core/exceptions.py b/contrib/pyzfs/libzfs_core/exceptions.py
index b26a37f5d..26d66a452 100644
--- a/contrib/pyzfs/libzfs_core/exceptions.py
+++ b/contrib/pyzfs/libzfs_core/exceptions.py
@@ -604,5 +604,4 @@ class RaidzExpansionRunning(ZFSError):
     errno = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS
     message = "A raidz device is currently expanding"
 
-
 # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4
diff --git a/man/man1/zhack.1 b/man/man1/zhack.1
index 743bd53b7..ebb136477 100644
--- a/man/man1/zhack.1
+++ b/man/man1/zhack.1
@@ -122,6 +122,24 @@ Example:
 .Nm zhack Cm label repair Fl cu Ar device
 Fix checksums and undetach a device
 .
+.It Xo
+.Nm zhack
+.Cm metaslab leak
+.Op Fl f
+.Ar pool
+.Xc
+Apply a fragmentation profile generated by
+.Sy zdb
+to the specified
+.Ar pool Ns
+\&.
+.Pp
+The
+.Fl f
+flag forces the profile to apply even if the vdevs in the
+.Ar pool
+don't have the same number of metaslabs as the fragmentation profile.
+.
 .El
 .
 .Sh GLOBAL OPTIONS
diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
index e00544e4a..c3290ea14 100644
--- a/man/man8/zdb.8
+++ b/man/man8/zdb.8
@@ -69,6 +69,13 @@
 .Op Fl U Ar cache
 .Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns …
 .Nm
+.Fl -allocated-map
+.Op Fl mAFLPXY
+.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns …
+.Op Fl t Ar txg
+.Op Fl U Ar cache
+.Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns …
+.Nm
 .Fl O
 .Op Fl K Ar key
 .Ar dataset path
@@ -128,6 +135,11 @@ that zdb may interpret inconsistent pool data and behave erratically.
 .Sh OPTIONS
 Display options:
 .Bl -tag -width Ds
+.It Fl -allocated-map
+Prints out a list of all the allocated regions in the pool.
+Primarily intended for use with the
+.Nm zhack metaslab leak
+subcommand.
 .It Fl b , -block-stats
 Display statistics regarding the number, size
 .Pq logical, physical and allocated
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index da33fc902..db3f25dc4 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -379,7 +379,7 @@ tags = ['functional', 'cli_root', 'zfs_wait']
 
 [tests/functional/cli_root/zhack]
 tests = ['zhack_label_repair_001', 'zhack_label_repair_002',
-    'zhack_label_repair_003', 'zhack_label_repair_004']
+    'zhack_label_repair_003', 'zhack_label_repair_004', 'zhack_metaslab_leak']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zhack']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 116873794..d9ec3686c 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1009,6 +1009,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zhack/zhack_label_repair_002.ksh \
 	functional/cli_root/zhack/zhack_label_repair_003.ksh \
 	functional/cli_root/zhack/zhack_label_repair_004.ksh \
+	functional/cli_root/zhack/zhack_metaslab_leak.ksh \
 	functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \
 	functional/cli_root/zpool_add/add-o_ashift.ksh \
 	functional/cli_root/zpool_add/add_prop_ashift.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh
new file mode 100755
index 000000000..cc72d6313
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh
@@ -0,0 +1,70 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+
+#
+# Description:
+#
+# Test whether zhack metaslab leak functions correctly
+#
+# Strategy:
+#
+# 1. Create pool on a loopback device with some test data
+# 2. Gather pool capacity stats
+# 3. Generate fragmentation data with zdb
+# 4. Destroy the pool
+# 5. Create a new pool with the same configuration
+# 6. Export the pool
+# 7. Apply the fragmentation information with zhack metaslab leak
+# 8. Import the pool
+# 9. Verify that allocated space is above, but within 5% of, the original
+
+. "$STF_SUITE"/include/libtest.shlib
+
+verify_runnable "global"
+
+function cleanup
+{
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+	rm -f $tmp
+}
+
+log_onexit cleanup
+log_assert "zhack metaslab leak leaks the right amount of space"
+
+typeset tmp=$(mktemp)
+
+log_must zpool create $TESTPOOL $DISKS
+for i in $(seq 1 16); do
+	log_must dd if=/dev/urandom of=/$TESTPOOL/f$i bs=1M count=16
+	log_must zpool sync $TESTPOOL
+done
+for i in $(seq 2 2 16); do
+	log_must rm /$TESTPOOL/f$i
+done
+for i in $(seq 1 16); do
+	log_must touch /$TESTPOOL/g$i
+	log_must zpool sync $TESTPOOL
+done
+
+alloc=$(zpool get -Hpo value alloc $TESTPOOL)
+log_must eval "zdb -m --allocated-map $TESTPOOL > $tmp"
+log_must zpool destroy $TESTPOOL
+
+log_must zpool create $TESTPOOL $DISKS
+log_must zpool export $TESTPOOL
+log_must eval "zhack metaslab leak $TESTPOOL < $tmp"
+log_must zpool import $TESTPOOL
+
+alloc2=$(zpool get -Hpo value alloc $TESTPOOL)
+
+(( alloc * 105 > alloc2 * 100 && alloc < alloc2 )) || \
+	log_fail "space usage changed too much: $alloc to $alloc2"
+
+log_pass "zhack metaslab leak behaved correctly"