Refactor zhack label repair and fix -c regression on nonzero TXG

This commit fixes a likely regression introduced by 64db435 where the
checksum repair functionality (`-c` or default behavior) will perform
checks and access data associated with the newer undetach (`-u`)
functionality, resulting in a failure when an uberblock's TXG is not 0
as required by `-u` but not `-c`

Additionally, code is refactored for better separation of tasks.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: buzzingwires <buzzingwires@outlook.com>
Closes #17732
This commit is contained in:
buzzingwires 2025-09-12 06:48:41 -04:00 committed by Brian Behlendorf
parent e3eb3ca3dc
commit 5f7253ca11
3 changed files with 110 additions and 68 deletions

View File

@ -714,6 +714,23 @@ zhack_repair_read_label(const int fd, vdev_label_t *vl,
return (0);
}
static int
zhack_repair_get_byteswap(const zio_eck_t *vdev_eck, const int l, int *byteswap)
{
if (vdev_eck->zec_magic == ZEC_MAGIC) {
*byteswap = B_FALSE;
} else if (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)) {
*byteswap = B_TRUE;
} else {
(void) fprintf(stderr, "error: label %d: "
"Expected the nvlist checksum magic number but instead got "
"0x%" PRIx64 "\n",
l, vdev_eck->zec_magic);
return (1);
}
return (0);
}
static void
zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,
const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum)
@ -740,33 +757,10 @@ zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,
}
static int
zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys,
const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg,
uint64_t *ashift)
zhack_repair_get_ashift(nvlist_t *cfg, const int l, uint64_t *ashift)
{
int err;
if (ub->ub_txg != 0) {
(void) fprintf(stderr,
"error: label %d: UB TXG of 0 expected, but got %"
PRIu64 "\n",
l, ub->ub_txg);
(void) fprintf(stderr, "It would appear the device was not "
"properly removed.\n");
return (1);
}
for (int i = 0; i < cfg_keys_len; i++) {
uint64_t val;
err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val);
if (err) {
(void) fprintf(stderr,
"error: label %d, %d: "
"cannot find nvlist key %s\n",
l, i, cfg_keys[i]);
return (err);
}
}
nvlist_t *vdev_tree_cfg;
err = nvlist_lookup_nvlist(cfg,
ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg);
@ -790,7 +784,7 @@ zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys,
(void) fprintf(stderr,
"error: label %d: nvlist key %s is zero\n",
l, ZPOOL_CONFIG_ASHIFT);
return (err);
return (1);
}
return (0);
@ -805,30 +799,35 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l)
*/
if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) {
const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp);
int err;
ub->ub_txg = txg;
if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) {
err = nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG);
if (err) {
(void) fprintf(stderr,
"error: label %d: "
"Failed to remove pool creation TXG\n",
l);
return (1);
return (err);
}
if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) {
err = nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG);
if (err) {
(void) fprintf(stderr,
"error: label %d: Failed to remove pool TXG to "
"be replaced.\n",
l);
return (1);
return (err);
}
if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) {
err = nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg);
if (err) {
(void) fprintf(stderr,
"error: label %d: "
"Failed to add pool TXG of %" PRIu64 "\n",
l, txg);
return (1);
return (err);
}
}
@ -922,6 +921,7 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data,
BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
const uint64_t actual_magic = vdev_eck->zec_magic;
int err = 0;
if (actual_magic != expected_magic) {
(void) fprintf(stderr, "error: label %d: "
"Expected "
@ -943,6 +943,36 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data,
return (err);
}
static int
zhack_repair_unpack_cfg(vdev_label_t *vl, const int l, nvlist_t **cfg)
{
const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID };
int err;
err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist,
VDEV_PHYS_SIZE - sizeof (zio_eck_t), cfg, 0);
if (err) {
(void) fprintf(stderr,
"error: cannot unpack nvlist label %d\n", l);
return (err);
}
for (int i = 0; i < ARRAY_SIZE(cfg_keys); i++) {
uint64_t val;
err = nvlist_lookup_uint64(*cfg, cfg_keys[i], &val);
if (err) {
(void) fprintf(stderr,
"error: label %d, %d: "
"cannot find nvlist key %s\n",
l, i, cfg_keys[i]);
return (err);
}
}
return (0);
}
static void
zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
vdev_label_t *vl, const uint64_t label_offset, const int l,
@ -956,10 +986,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
(zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1;
const uint64_t vdev_phys_offset =
label_offset + offsetof(vdev_label_t, vl_vdev_phys);
const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID };
nvlist_t *cfg;
nvlist_t *vdev_tree_cfg = NULL;
uint64_t ashift;
int byteswap;
@ -967,18 +994,9 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
if (err)
return;
if (vdev_eck->zec_magic == 0) {
(void) fprintf(stderr, "error: label %d: "
"Expected the nvlist checksum magic number to not be zero"
"\n",
l);
(void) fprintf(stderr, "There should already be a checksum "
"for the label.\n");
err = zhack_repair_get_byteswap(vdev_eck, l, &byteswap);
if (err)
return;
}
byteswap =
(vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC));
if (byteswap) {
byteswap_uint64_array(&vdev_eck->zec_cksum,
@ -994,16 +1012,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
return;
}
err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist,
VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0);
if (err) {
(void) fprintf(stderr,
"error: cannot unpack nvlist label %d\n", l);
return;
}
err = zhack_repair_check_label(ub,
l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift);
err = zhack_repair_unpack_cfg(vl, l, &cfg);
if (err)
return;
@ -1011,6 +1020,19 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
char *buf;
size_t buflen;
if (ub->ub_txg != 0) {
(void) fprintf(stderr,
"error: label %d: UB TXG of 0 expected, but got %"
PRIu64 "\n", l, ub->ub_txg);
(void) fprintf(stderr, "It would appear the device was "
"not properly detached.\n");
return;
}
err = zhack_repair_get_ashift(cfg, l, &ashift);
if (err)
return;
err = zhack_repair_undetach(ub, cfg, l);
if (err)
return;

View File

@ -33,13 +33,16 @@
# Test one:
#
# 1. Create pool on a loopback device with some test data
# 2. Export the pool.
# 3. Corrupt all label checksums in the pool
# 4. Check that pool cannot be imported
# 5. Verify that it cannot be imported after using zhack label repair -u
# 2. Checksum repair should work with a valid TXG. Repeatedly write and
# sync the pool so there are enough transactions for every uberblock
# to have a TXG
# 3. Export the pool.
# 4. Corrupt all label checksums in the pool
# 5. Check that pool cannot be imported
# 6. Verify that it cannot be imported after using zhack label repair -u
# to ensure that the -u option will quit on corrupted checksums.
# 6. Use zhack label repair -c on device
# 7. Check that pool can be imported and that data is intact
# 7. Use zhack label repair -c on device
# 8. Check that pool can be imported and that data is intact
#
# Test two:
#
@ -170,6 +173,17 @@ function setup_dataset
check_dataset true
}
function force_transactions
{
L_TIMES="$1"
for ((i=0; i < L_TIMES; i++))
do
touch "$TESTDIR"/"test" || return $?
zpool sync -f "$TESTPOOL" || return $?
done
return 0
}
function get_practical_size
{
L_SIZE="$1"
@ -257,6 +271,9 @@ function run_test_one
setup_dataset
# Force 256 extra transactions to ensure all uberblocks are assigned a TXG
log_must force_transactions 256
log_must zpool export "$TESTPOOL"
corrupt_labels "$L_SIZE" "$VIRTUAL_DISK"

View File

@ -18,13 +18,16 @@
# Strategy:
#
# 1. Create pool on a loopback device with some test data
# 2. Export the pool.
# 3. Corrupt all label checksums in the pool
# 4. Check that pool cannot be imported
# 5. Verify that it cannot be imported after using zhack label repair -u
# 2. Checksum repair should work with a valid TXG. Repeatedly write and
# sync the pool so there are enough transactions for every uberblock
# to have a TXG
# 3. Export the pool.
# 4. Corrupt all label checksums in the pool
# 5. Check that pool cannot be imported
# 6. Verify that it cannot be imported after using zhack label repair -u
# to ensure that the -u option will quit on corrupted checksums.
# 6. Use zhack label repair -c on device
# 7. Check that pool can be imported and that data is intact
# 7. Use zhack label repair -c on device
# 8. Check that pool can be imported and that data is intact
. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib