Add allocation profile export and zhack subcommand for import

When attempting to debug performance problems on large systems, one of
the major factors that affect performance is free space
fragmentation. This heavily affects the allocation process, which is an
area of active development in ZFS. Unfortunately, fragmenting a large
pool for testing purposes is time consuming; it usually involves filling
the pool and then repeatedly overwriting data until the free space
becomes fragmented, which can take many hours. And even if the time is
available, artificial workloads rarely generate the same fragmentation
patterns as the natural workloads they're attempting to mimic.

This patch has two parts. First, in zdb, we add the ability to export
the full allocation map of the pool. It iterates over each vdev,
printing every allocated segment in the ms_allocatable range tree. This
can be done while the pool is online, though in that case the allocation
map may actually be from several different TXGs as new ones are loaded
on demand.

The second is a new subcommand for zhack, zhack metaslab leak (and its
supporting kernel changes). This is a zhack subcommand that imports a
pool and then modified the range trees of the metaslabs, allowing the
sync process to write them out normall. It does not currently store
those allocations anywhere to make them reversible, and there is no
corresponding free subcommand (which would be extremely dangerous); this
is an irreversible process, only intended for performance testing. The
only way to reclaim the space afterwards is to destroy the pool or roll
back to a checkpoint.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #17576
This commit is contained in:
Paul Dagnelie
2025-04-24 13:01:00 -07:00
committed by Tony Hutter
parent 9a5027ccce
commit 411249498e
10 changed files with 329 additions and 12 deletions
+39 -6
View File
@@ -107,7 +107,9 @@ extern uint_t zfs_reconstruct_indirect_combinations_max;
extern uint_t zfs_btree_verify_intensity;
static const char cmdname[] = "zdb";
uint8_t dump_opt[256];
uint8_t dump_opt[512];
#define ALLOCATED_OPT 256
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
@@ -1651,6 +1653,16 @@ dump_metaslab_stats(metaslab_t *msp)
dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
}
static void
dump_allocated(void *arg, uint64_t start, uint64_t size)
{
uint64_t *off = arg;
if (*off != start)
(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", *off,
start - *off);
*off = start + size;
}
static void
dump_metaslab(metaslab_t *msp)
{
@@ -1667,13 +1679,24 @@ dump_metaslab(metaslab_t *msp)
(u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
(u_longlong_t)space_map_object(sm), freebuf);
if (dump_opt['m'] > 2 && !dump_opt['L']) {
if (dump_opt[ALLOCATED_OPT] ||
(dump_opt['m'] > 2 && !dump_opt['L'])) {
mutex_enter(&msp->ms_lock);
VERIFY0(metaslab_load(msp));
}
if (dump_opt['m'] > 2 && !dump_opt['L']) {
zfs_range_tree_stat_verify(msp->ms_allocatable);
dump_metaslab_stats(msp);
metaslab_unload(msp);
mutex_exit(&msp->ms_lock);
}
if (dump_opt[ALLOCATED_OPT]) {
uint64_t off = msp->ms_start;
zfs_range_tree_walk(msp->ms_allocatable, dump_allocated,
&off);
if (off != msp->ms_start + msp->ms_size)
(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off,
msp->ms_size - off);
}
if (dump_opt['m'] > 1 && sm != NULL &&
@@ -1688,6 +1711,12 @@ dump_metaslab(metaslab_t *msp)
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
if (dump_opt[ALLOCATED_OPT] ||
(dump_opt['m'] > 2 && !dump_opt['L'])) {
metaslab_unload(msp);
mutex_exit(&msp->ms_lock);
}
if (vd->vdev_ops == &vdev_draid_ops)
ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
else
@@ -1724,8 +1753,9 @@ print_vdev_metaslab_header(vdev_t *vd)
}
}
(void) printf("\tvdev %10llu %s",
(u_longlong_t)vd->vdev_id, bias_str);
(void) printf("\tvdev %10llu\t%s metaslab shift %4llu",
(u_longlong_t)vd->vdev_id, bias_str,
(u_longlong_t)vd->vdev_ms_shift);
if (ms_flush_data_obj != 0) {
(void) printf(" ms_unflushed_phys object %llu",
@@ -9318,6 +9348,8 @@ main(int argc, char **argv)
{"all-reconstruction", no_argument, NULL, 'Y'},
{"livelist", no_argument, NULL, 'y'},
{"zstd-headers", no_argument, NULL, 'Z'},
{"allocated-map", no_argument, NULL,
ALLOCATED_OPT},
{0, 0, 0, 0}
};
@@ -9348,6 +9380,7 @@ main(int argc, char **argv)
case 'u':
case 'y':
case 'Z':
case ALLOCATED_OPT:
dump_opt[c]++;
dump_all = 0;
break;
+1 -1
View File
@@ -29,6 +29,6 @@
#define _ZDB_H
void dump_intent_log(zilog_t *);
extern uint8_t dump_opt[256];
extern uint8_t dump_opt[512];
#endif /* _ZDB_H */
-2
View File
@@ -48,8 +48,6 @@
#include "zdb.h"
extern uint8_t dump_opt[256];
static char tab_prefix[4] = "\t\t\t";
static void
+187 -1
View File
@@ -54,6 +54,7 @@
#include <sys/dmu_tx.h>
#include <zfeature_common.h>
#include <libzutil.h>
#include <sys/metaslab_impl.h>
static importargs_t g_importargs;
static char *g_pool;
@@ -93,7 +94,10 @@ usage(void)
" -c repair corrupted label checksums\n"
" -u restore the label on a detached device\n"
"\n"
" <device> : path to vdev\n");
" <device> : path to vdev\n"
"\n"
" metaslab leak <pool>\n"
" apply allocation map from zdb to specified pool\n");
exit(1);
}
@@ -500,6 +504,186 @@ zhack_do_feature(int argc, char **argv)
return (0);
}
static boolean_t
strstarts(const char *a, const char *b)
{
return (strncmp(a, b, strlen(b)) == 0);
}
static void
metaslab_force_alloc(metaslab_t *msp, uint64_t start, uint64_t size,
dmu_tx_t *tx)
{
ASSERT(msp->ms_disabled);
ASSERT(MUTEX_HELD(&msp->ms_lock));
uint64_t txg = dmu_tx_get_txg(tx);
uint64_t off = start;
while (off < start + size) {
uint64_t ostart, osize;
boolean_t found = zfs_range_tree_find_in(msp->ms_allocatable,
off, start + size - off, &ostart, &osize);
if (!found)
break;
zfs_range_tree_remove(msp->ms_allocatable, ostart, osize);
if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(msp->ms_group->mg_vd, VDD_METASLAB, msp,
txg);
zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], ostart,
osize);
msp->ms_allocating_total += osize;
off = ostart + osize;
}
}
static void
zhack_do_metaslab_leak(int argc, char **argv)
{
int c;
char *target;
spa_t *spa;
optind = 1;
boolean_t force = B_FALSE;
while ((c = getopt(argc, argv, "f")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
break;
default:
usage();
break;
}
}
argc -= optind;
argv += optind;
if (argc < 1) {
(void) fprintf(stderr, "error: missing pool name\n");
usage();
}
target = argv[0];
zhack_spa_open(target, B_FALSE, FTAG, &spa);
spa_config_enter(spa, SCL_VDEV | SCL_ALLOC, FTAG, RW_READER);
char *line = NULL;
size_t cap = 0;
vdev_t *vd = NULL;
metaslab_t *prev = NULL;
dmu_tx_t *tx = NULL;
while (getline(&line, &cap, stdin) > 0) {
if (strstarts(line, "\tvdev ")) {
uint64_t vdev_id, ms_shift;
if (sscanf(line,
"\tvdev %10"PRIu64"\t%*s metaslab shift %4"PRIu64,
&vdev_id, &ms_shift) == 1) {
VERIFY3U(sscanf(line, "\tvdev %"PRIu64
"\t metaslab shift %4"PRIu64,
&vdev_id, &ms_shift), ==, 2);
}
vd = vdev_lookup_top(spa, vdev_id);
if (vd == NULL) {
fprintf(stderr, "error: no such vdev with "
"id %"PRIu64"\n", vdev_id);
break;
}
if (tx) {
dmu_tx_commit(tx);
mutex_exit(&prev->ms_lock);
metaslab_enable(prev, B_FALSE, B_FALSE);
tx = NULL;
prev = NULL;
}
if (vd->vdev_ms_shift != ms_shift) {
fprintf(stderr, "error: ms_shift mismatch: %"
PRIu64" != %"PRIu64"\n", vd->vdev_ms_shift,
ms_shift);
break;
}
} else if (strstarts(line, "\tmetaslabs ")) {
uint64_t ms_count;
VERIFY3U(sscanf(line, "\tmetaslabs %"PRIu64, &ms_count),
==, 1);
ASSERT(vd);
if (!force && vd->vdev_ms_count != ms_count) {
fprintf(stderr, "error: ms_count mismatch: %"
PRIu64" != %"PRIu64"\n", vd->vdev_ms_count,
ms_count);
break;
}
} else if (strstarts(line, "ALLOC:")) {
uint64_t start, size;
VERIFY3U(sscanf(line, "ALLOC: %"PRIu64" %"PRIu64"\n",
&start, &size), ==, 2);
ASSERT(vd);
metaslab_t *cur =
vd->vdev_ms[start >> vd->vdev_ms_shift];
if (prev != cur) {
if (prev) {
dmu_tx_commit(tx);
mutex_exit(&prev->ms_lock);
metaslab_enable(prev, B_FALSE, B_FALSE);
}
ASSERT(cur);
metaslab_disable(cur);
mutex_enter(&cur->ms_lock);
metaslab_load(cur);
prev = cur;
tx = dmu_tx_create_dd(
spa_get_dsl(vd->vdev_spa)->dp_root_dir);
dmu_tx_assign(tx, DMU_TX_WAIT);
}
metaslab_force_alloc(cur, start, size, tx);
} else {
continue;
}
}
if (tx) {
dmu_tx_commit(tx);
mutex_exit(&prev->ms_lock);
metaslab_enable(prev, B_FALSE, B_FALSE);
tx = NULL;
prev = NULL;
}
if (line)
free(line);
spa_config_exit(spa, SCL_VDEV | SCL_ALLOC, FTAG);
spa_close(spa, FTAG);
}
static int
zhack_do_metaslab(int argc, char **argv)
{
char *subcommand;
argc--;
argv++;
if (argc == 0) {
(void) fprintf(stderr,
"error: no metaslab operation specified\n");
usage();
}
subcommand = argv[0];
if (strcmp(subcommand, "leak") == 0) {
zhack_do_metaslab_leak(argc, argv);
} else {
(void) fprintf(stderr, "error: unknown subcommand: %s\n",
subcommand);
usage();
}
return (0);
}
#define ASHIFT_UBERBLOCK_SHIFT(ashift) \
MIN(MAX(ashift, UBERBLOCK_SHIFT), \
MAX_UBERBLOCK_SHIFT)
@@ -1015,6 +1199,8 @@ main(int argc, char **argv)
rv = zhack_do_feature(argc, argv);
} else if (strcmp(subcommand, "label") == 0) {
return (zhack_do_label(argc, argv));
} else if (strcmp(subcommand, "metaslab") == 0) {
rv = zhack_do_metaslab(argc, argv);
} else {
(void) fprintf(stderr, "error: unknown subcommand: %s\n",
subcommand);