mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-01-14 17:22:05 +03:00
Add allocation profile export and zhack subcommand for import
When attempting to debug performance problems on large systems, one of the major factors that affect performance is free space fragmentation. This heavily affects the allocation process, which is an area of active development in ZFS. Unfortunately, fragmenting a large pool for testing purposes is time consuming; it usually involves filling the pool and then repeatedly overwriting data until the free space becomes fragmented, which can take many hours. And even if the time is available, artificial workloads rarely generate the same fragmentation patterns as the natural workloads they're attempting to mimic. This patch has two parts. First, in zdb, we add the ability to export the full allocation map of the pool. It iterates over each vdev, printing every allocated segment in the ms_allocatable range tree. This can be done while the pool is online, though in that case the allocation map may actually be from several different TXGs as new ones are loaded on demand. The second is a new subcommand for zhack, zhack metaslab leak (and its supporting kernel changes). This is a zhack subcommand that imports a pool and then modifies the range trees of the metaslabs, allowing the sync process to write them out normally. It does not currently store those allocations anywhere to make them reversible, and there is no corresponding free subcommand (which would be extremely dangerous); this is an irreversible process, only intended for performance testing. The only way to reclaim the space afterwards is to destroy the pool or roll back to a checkpoint. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com> Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #17576
This commit is contained in:
parent
ca4f7d6d49
commit
26983d6fa7
@ -107,7 +107,9 @@ extern uint_t zfs_reconstruct_indirect_combinations_max;
|
||||
extern uint_t zfs_btree_verify_intensity;
|
||||
|
||||
static const char cmdname[] = "zdb";
|
||||
uint8_t dump_opt[256];
|
||||
uint8_t dump_opt[512];
|
||||
|
||||
#define ALLOCATED_OPT 256
|
||||
|
||||
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
|
||||
|
||||
@ -1666,6 +1668,16 @@ dump_metaslab_stats(metaslab_t *msp)
|
||||
dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
|
||||
}
|
||||
|
||||
/*
 * Range tree walk callback used when dumping the allocated map.
 *
 * 'arg' points at a running offset holding the end of the previously
 * visited segment (or the starting offset chosen by the caller).  If the
 * segment passed in does not begin exactly at that offset, the span in
 * between is reported on stdout as an "ALLOC: <offset> <length>" record.
 * In every case the running offset is moved to the end of this segment.
 */
static void
dump_allocated(void *arg, uint64_t start, uint64_t size)
{
	uint64_t *cursor = arg;
	const uint64_t prev_end = *cursor;

	/* The next gap, if any, begins where this segment finishes. */
	*cursor = start + size;

	/* Segment is contiguous with the previous one: nothing to report. */
	if (prev_end == start)
		return;

	(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", prev_end,
	    start - prev_end);
}
|
||||
|
||||
static void
|
||||
dump_metaslab(metaslab_t *msp)
|
||||
{
|
||||
@ -1682,13 +1694,24 @@ dump_metaslab(metaslab_t *msp)
|
||||
(u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
|
||||
(u_longlong_t)space_map_object(sm), freebuf);
|
||||
|
||||
if (dump_opt['m'] > 2 && !dump_opt['L']) {
|
||||
if (dump_opt[ALLOCATED_OPT] ||
|
||||
(dump_opt['m'] > 2 && !dump_opt['L'])) {
|
||||
mutex_enter(&msp->ms_lock);
|
||||
VERIFY0(metaslab_load(msp));
|
||||
}
|
||||
|
||||
if (dump_opt['m'] > 2 && !dump_opt['L']) {
|
||||
zfs_range_tree_stat_verify(msp->ms_allocatable);
|
||||
dump_metaslab_stats(msp);
|
||||
metaslab_unload(msp);
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
|
||||
if (dump_opt[ALLOCATED_OPT]) {
	/*
	 * Walk ms_allocatable and let dump_allocated() report every gap
	 * between consecutive segments as an ALLOC record.  'off' tracks
	 * the end of the last segment seen, as an absolute offset that
	 * starts at ms_start.
	 */
	uint64_t off = msp->ms_start;
	uint64_t ms_end = msp->ms_start + msp->ms_size;

	zfs_range_tree_walk(msp->ms_allocatable, dump_allocated,
	    &off);

	/*
	 * Report the region between the last segment and the end of the
	 * metaslab.  Because 'off' is absolute, the length has to be
	 * measured from the metaslab's absolute end, not from ms_size
	 * alone (which is only correct for a metaslab starting at 0).
	 */
	if (off != ms_end)
		(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off,
		    ms_end - off);
}
|
||||
|
||||
if (dump_opt['m'] > 1 && sm != NULL &&
|
||||
@ -1703,6 +1726,12 @@ dump_metaslab(metaslab_t *msp)
|
||||
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
|
||||
}
|
||||
|
||||
if (dump_opt[ALLOCATED_OPT] ||
|
||||
(dump_opt['m'] > 2 && !dump_opt['L'])) {
|
||||
metaslab_unload(msp);
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
|
||||
if (vd->vdev_ops == &vdev_draid_ops)
|
||||
ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
|
||||
else
|
||||
@ -1739,8 +1768,9 @@ print_vdev_metaslab_header(vdev_t *vd)
|
||||
}
|
||||
}
|
||||
|
||||
(void) printf("\tvdev %10llu %s",
|
||||
(u_longlong_t)vd->vdev_id, bias_str);
|
||||
(void) printf("\tvdev %10llu\t%s metaslab shift %4llu",
|
||||
(u_longlong_t)vd->vdev_id, bias_str,
|
||||
(u_longlong_t)vd->vdev_ms_shift);
|
||||
|
||||
if (ms_flush_data_obj != 0) {
|
||||
(void) printf(" ms_unflushed_phys object %llu",
|
||||
@ -9375,6 +9405,8 @@ main(int argc, char **argv)
|
||||
{"all-reconstruction", no_argument, NULL, 'Y'},
|
||||
{"livelist", no_argument, NULL, 'y'},
|
||||
{"zstd-headers", no_argument, NULL, 'Z'},
|
||||
{"allocated-map", no_argument, NULL,
|
||||
ALLOCATED_OPT},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
|
||||
@ -9405,6 +9437,7 @@ main(int argc, char **argv)
|
||||
case 'u':
|
||||
case 'y':
|
||||
case 'Z':
|
||||
case ALLOCATED_OPT:
|
||||
dump_opt[c]++;
|
||||
dump_all = 0;
|
||||
break;
|
||||
|
||||
@ -29,6 +29,6 @@
|
||||
#define _ZDB_H
|
||||
|
||||
void dump_intent_log(zilog_t *);
|
||||
extern uint8_t dump_opt[256];
|
||||
extern uint8_t dump_opt[512];
|
||||
|
||||
#endif /* _ZDB_H */
|
||||
|
||||
@ -48,8 +48,6 @@
|
||||
|
||||
#include "zdb.h"
|
||||
|
||||
extern uint8_t dump_opt[256];
|
||||
|
||||
static char tab_prefix[4] = "\t\t\t";
|
||||
|
||||
static void
|
||||
|
||||
188
cmd/zhack.c
188
cmd/zhack.c
@ -54,6 +54,7 @@
|
||||
#include <sys/dmu_tx.h>
|
||||
#include <zfeature_common.h>
|
||||
#include <libzutil.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
|
||||
static importargs_t g_importargs;
|
||||
static char *g_pool;
|
||||
@ -93,7 +94,10 @@ usage(void)
|
||||
" -c repair corrupted label checksums\n"
|
||||
" -u restore the label on a detached device\n"
|
||||
"\n"
|
||||
" <device> : path to vdev\n");
|
||||
" <device> : path to vdev\n"
|
||||
"\n"
|
||||
" metaslab leak <pool>\n"
|
||||
" apply allocation map from zdb to specified pool\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -500,6 +504,186 @@ zhack_do_feature(int argc, char **argv)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
strstarts(const char *a, const char *b)
|
||||
{
|
||||
return (strncmp(a, b, strlen(b)) == 0);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_force_alloc(metaslab_t *msp, uint64_t start, uint64_t size,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
ASSERT(msp->ms_disabled);
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
uint64_t txg = dmu_tx_get_txg(tx);
|
||||
|
||||
uint64_t off = start;
|
||||
while (off < start + size) {
|
||||
uint64_t ostart, osize;
|
||||
boolean_t found = zfs_range_tree_find_in(msp->ms_allocatable,
|
||||
off, start + size - off, &ostart, &osize);
|
||||
if (!found)
|
||||
break;
|
||||
zfs_range_tree_remove(msp->ms_allocatable, ostart, osize);
|
||||
|
||||
if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
|
||||
vdev_dirty(msp->ms_group->mg_vd, VDD_METASLAB, msp,
|
||||
txg);
|
||||
|
||||
zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], ostart,
|
||||
osize);
|
||||
msp->ms_allocating_total += osize;
|
||||
off = ostart + osize;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
zhack_do_metaslab_leak(int argc, char **argv)
|
||||
{
|
||||
int c;
|
||||
char *target;
|
||||
spa_t *spa;
|
||||
|
||||
optind = 1;
|
||||
boolean_t force = B_FALSE;
|
||||
while ((c = getopt(argc, argv, "f")) != -1) {
|
||||
switch (c) {
|
||||
case 'f':
|
||||
force = B_TRUE;
|
||||
break;
|
||||
default:
|
||||
usage();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
argc -= optind;
|
||||
argv += optind;
|
||||
|
||||
if (argc < 1) {
|
||||
(void) fprintf(stderr, "error: missing pool name\n");
|
||||
usage();
|
||||
}
|
||||
target = argv[0];
|
||||
|
||||
zhack_spa_open(target, B_FALSE, FTAG, &spa);
|
||||
spa_config_enter(spa, SCL_VDEV | SCL_ALLOC, FTAG, RW_READER);
|
||||
|
||||
char *line = NULL;
|
||||
size_t cap = 0;
|
||||
|
||||
vdev_t *vd = NULL;
|
||||
metaslab_t *prev = NULL;
|
||||
dmu_tx_t *tx = NULL;
|
||||
while (getline(&line, &cap, stdin) > 0) {
|
||||
if (strstarts(line, "\tvdev ")) {
|
||||
uint64_t vdev_id, ms_shift;
|
||||
if (sscanf(line,
|
||||
"\tvdev %10"PRIu64"\t%*s metaslab shift %4"PRIu64,
|
||||
&vdev_id, &ms_shift) == 1) {
|
||||
VERIFY3U(sscanf(line, "\tvdev %"PRIu64
|
||||
"\t metaslab shift %4"PRIu64,
|
||||
&vdev_id, &ms_shift), ==, 2);
|
||||
}
|
||||
vd = vdev_lookup_top(spa, vdev_id);
|
||||
if (vd == NULL) {
|
||||
fprintf(stderr, "error: no such vdev with "
|
||||
"id %"PRIu64"\n", vdev_id);
|
||||
break;
|
||||
}
|
||||
if (tx) {
|
||||
dmu_tx_commit(tx);
|
||||
mutex_exit(&prev->ms_lock);
|
||||
metaslab_enable(prev, B_FALSE, B_FALSE);
|
||||
tx = NULL;
|
||||
prev = NULL;
|
||||
}
|
||||
if (vd->vdev_ms_shift != ms_shift) {
|
||||
fprintf(stderr, "error: ms_shift mismatch: %"
|
||||
PRIu64" != %"PRIu64"\n", vd->vdev_ms_shift,
|
||||
ms_shift);
|
||||
break;
|
||||
}
|
||||
} else if (strstarts(line, "\tmetaslabs ")) {
|
||||
uint64_t ms_count;
|
||||
VERIFY3U(sscanf(line, "\tmetaslabs %"PRIu64, &ms_count),
|
||||
==, 1);
|
||||
ASSERT(vd);
|
||||
if (!force && vd->vdev_ms_count != ms_count) {
|
||||
fprintf(stderr, "error: ms_count mismatch: %"
|
||||
PRIu64" != %"PRIu64"\n", vd->vdev_ms_count,
|
||||
ms_count);
|
||||
break;
|
||||
}
|
||||
} else if (strstarts(line, "ALLOC:")) {
|
||||
uint64_t start, size;
|
||||
VERIFY3U(sscanf(line, "ALLOC: %"PRIu64" %"PRIu64"\n",
|
||||
&start, &size), ==, 2);
|
||||
|
||||
ASSERT(vd);
|
||||
metaslab_t *cur =
|
||||
vd->vdev_ms[start >> vd->vdev_ms_shift];
|
||||
if (prev != cur) {
|
||||
if (prev) {
|
||||
dmu_tx_commit(tx);
|
||||
mutex_exit(&prev->ms_lock);
|
||||
metaslab_enable(prev, B_FALSE, B_FALSE);
|
||||
}
|
||||
ASSERT(cur);
|
||||
metaslab_disable(cur);
|
||||
mutex_enter(&cur->ms_lock);
|
||||
metaslab_load(cur);
|
||||
prev = cur;
|
||||
tx = dmu_tx_create_dd(
|
||||
spa_get_dsl(vd->vdev_spa)->dp_root_dir);
|
||||
dmu_tx_assign(tx, DMU_TX_WAIT);
|
||||
}
|
||||
|
||||
metaslab_force_alloc(cur, start, size, tx);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (tx) {
|
||||
dmu_tx_commit(tx);
|
||||
mutex_exit(&prev->ms_lock);
|
||||
metaslab_enable(prev, B_FALSE, B_FALSE);
|
||||
tx = NULL;
|
||||
prev = NULL;
|
||||
}
|
||||
if (line)
|
||||
free(line);
|
||||
|
||||
spa_config_exit(spa, SCL_VDEV | SCL_ALLOC, FTAG);
|
||||
spa_close(spa, FTAG);
|
||||
}
|
||||
|
||||
static int
|
||||
zhack_do_metaslab(int argc, char **argv)
|
||||
{
|
||||
char *subcommand;
|
||||
|
||||
argc--;
|
||||
argv++;
|
||||
if (argc == 0) {
|
||||
(void) fprintf(stderr,
|
||||
"error: no metaslab operation specified\n");
|
||||
usage();
|
||||
}
|
||||
|
||||
subcommand = argv[0];
|
||||
if (strcmp(subcommand, "leak") == 0) {
|
||||
zhack_do_metaslab_leak(argc, argv);
|
||||
} else {
|
||||
(void) fprintf(stderr, "error: unknown subcommand: %s\n",
|
||||
subcommand);
|
||||
usage();
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
#define ASHIFT_UBERBLOCK_SHIFT(ashift) \
|
||||
MIN(MAX(ashift, UBERBLOCK_SHIFT), \
|
||||
MAX_UBERBLOCK_SHIFT)
|
||||
@ -1015,6 +1199,8 @@ main(int argc, char **argv)
|
||||
rv = zhack_do_feature(argc, argv);
|
||||
} else if (strcmp(subcommand, "label") == 0) {
|
||||
return (zhack_do_label(argc, argv));
|
||||
} else if (strcmp(subcommand, "metaslab") == 0) {
|
||||
rv = zhack_do_metaslab(argc, argv);
|
||||
} else {
|
||||
(void) fprintf(stderr, "error: unknown subcommand: %s\n",
|
||||
subcommand);
|
||||
|
||||
@ -604,5 +604,4 @@ class RaidzExpansionRunning(ZFSError):
|
||||
errno = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS
|
||||
message = "A raidz device is currently expanding"
|
||||
|
||||
|
||||
# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4
|
||||
|
||||
@ -122,6 +122,24 @@ Example:
|
||||
.Nm zhack Cm label repair Fl cu Ar device
|
||||
Fix checksums and undetach a device
|
||||
.
|
||||
.It Xo
|
||||
.Nm zhack
|
||||
.Cm metaslab leak
|
||||
.Op Fl f
|
||||
.Ar pool
|
||||
.Xc
|
||||
Apply a fragmentation profile generated by
|
||||
.Sy zdb
|
||||
to the specified
|
||||
.Ar pool Ns
|
||||
\&.
|
||||
.Pp
|
||||
The
|
||||
.Fl f
|
||||
flag forces the profile to apply even if the vdevs in the
|
||||
.Ar pool
|
||||
don't have the same number of metaslabs as the fragmentation profile.
|
||||
.
|
||||
.El
|
||||
.
|
||||
.Sh GLOBAL OPTIONS
|
||||
|
||||
@ -69,6 +69,13 @@
|
||||
.Op Fl U Ar cache
|
||||
.Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns …
|
||||
.Nm
|
||||
.Fl -allocated-map
|
||||
.Op Fl mAFLPXY
|
||||
.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns …
|
||||
.Op Fl t Ar txg
|
||||
.Op Fl U Ar cache
|
||||
.Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns …
|
||||
.Nm
|
||||
.Fl O
|
||||
.Op Fl K Ar key
|
||||
.Ar dataset path
|
||||
@ -128,6 +135,11 @@ that zdb may interpret inconsistent pool data and behave erratically.
|
||||
.Sh OPTIONS
|
||||
Display options:
|
||||
.Bl -tag -width Ds
|
||||
.It Fl -allocated-map
|
||||
Prints out a list of all the allocated regions in the pool.
|
||||
Primarily intended for use with the
|
||||
.Nm zhack metaslab leak
|
||||
subcommand.
|
||||
.It Fl b , -block-stats
|
||||
Display statistics regarding the number, size
|
||||
.Pq logical, physical and allocated
|
||||
|
||||
@ -379,7 +379,7 @@ tags = ['functional', 'cli_root', 'zfs_wait']
|
||||
|
||||
[tests/functional/cli_root/zhack]
|
||||
tests = ['zhack_label_repair_001', 'zhack_label_repair_002',
|
||||
'zhack_label_repair_003', 'zhack_label_repair_004']
|
||||
'zhack_label_repair_003', 'zhack_label_repair_004', 'zhack_metaslab_leak']
|
||||
pre =
|
||||
post =
|
||||
tags = ['functional', 'cli_root', 'zhack']
|
||||
|
||||
@ -1009,6 +1009,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/cli_root/zhack/zhack_label_repair_002.ksh \
|
||||
functional/cli_root/zhack/zhack_label_repair_003.ksh \
|
||||
functional/cli_root/zhack/zhack_label_repair_004.ksh \
|
||||
functional/cli_root/zhack/zhack_metaslab_leak.ksh \
|
||||
functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \
|
||||
functional/cli_root/zpool_add/add-o_ashift.ksh \
|
||||
functional/cli_root/zpool_add/add_prop_ashift.ksh \
|
||||
|
||||
70
tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh
Executable file
70
tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh
Executable file
@ -0,0 +1,70 @@
|
||||
#!/bin/ksh
|
||||
# SPDX-License-Identifier: CDDL-1.0
|
||||
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
|
||||
#
|
||||
# Description:
|
||||
#
|
||||
# Test whether zhack metaslab leak functions correctly
|
||||
#
|
||||
# Strategy:
|
||||
#
|
||||
# 1. Create pool on a loopback device with some test data
|
||||
# 2. Gather pool capacity stats
|
||||
# 3. Generate fragmentation data with zdb
|
||||
# 4. Destroy the pool
|
||||
# 5. Create a new pool with the same configuration
|
||||
# 6. Export the pool
|
||||
# 7. Apply the fragmentation information with zhack metaslab leak
|
||||
# 8. Import the pool
|
||||
# 9. Verify that pool capacity stats match
|
||||
|
||||
. "$STF_SUITE"/include/libtest.shlib
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
# Exit handler: remove the test pool if it is currently imported and
# delete the scratch file holding the allocation map.  Both steps are
# tolerant of the test having failed part-way (pool exported or already
# destroyed, scratch file never created).
function cleanup
{
	poolexists $TESTPOOL && destroy_pool $TESTPOOL
	[[ -n "$tmp" ]] && rm -f "$tmp"
	return 0
}
|
||||
|
||||
log_onexit cleanup
log_assert "zhack metaslab leak leaks the right amount of space"

# Scratch file that receives the allocation map written by zdb.
typeset tmp=$(mktemp)

# Populate a fresh pool: write sixteen 16M files, syncing after each one.
log_must zpool create $TESTPOOL $DISKS
for n in $(seq 1 16); do
	log_must dd if=/dev/urandom of=/$TESTPOOL/f$n bs=1M count=16
	log_must zpool sync $TESTPOOL
done

# Remove every second file (f2, f4, ... f16).
for n in $(seq 2 2 16); do
	log_must rm /$TESTPOOL/f$n
done

# Create sixteen empty files, again syncing after each one.
for n in $(seq 1 16); do
	log_must touch /$TESTPOOL/g$n
	log_must zpool sync $TESTPOOL
done

# Record the allocated space and export the allocation map, then start over.
alloc=$(zpool get -Hpo value alloc $TESTPOOL)
log_must eval "zdb -m --allocated-map $TESTPOOL > $tmp"
log_must zpool destroy $TESTPOOL

# Recreate the pool and apply the saved map while the pool is exported.
log_must zpool create $TESTPOOL $DISKS
log_must zpool export $TESTPOOL
log_must eval "zhack metaslab leak $TESTPOOL < $tmp"
log_must zpool import $TESTPOOL

alloc2=$(zpool get -Hpo value alloc $TESTPOOL)

# The new allocation must be larger than the original, but by less than 5%.
if ! { [[ $((alloc * 1.05)) -gt $alloc2 ]] && [[ $alloc -lt $alloc2 ]]; }; then
	log_fail "space usage changed too much: $alloc to $alloc2"
fi

log_pass "zhack metaslab leak behaved correctly"
|
||||
Loading…
Reference in New Issue
Block a user