Refactor zhack label repair and fix -c regression on nonzero TXG

This commit fixes a likely regression introduced by 64db435, in which
the checksum repair functionality (`-c`, or the default behavior)
performs checks and accesses data that belong to the newer undetach
(`-u`) functionality. As a result, checksum repair fails whenever an
uberblock's TXG is not 0, even though a TXG of 0 is required only by
`-u` and not by `-c`.

Additionally, code is refactored for better separation of tasks.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: buzzingwires <buzzingwires@outlook.com>
Closes #17732
This commit is contained in:
buzzingwires 2025-09-12 06:48:41 -04:00 committed by Brian Behlendorf
parent e3eb3ca3dc
commit 5f7253ca11
3 changed files with 110 additions and 68 deletions

View File

@ -714,6 +714,23 @@ zhack_repair_read_label(const int fd, vdev_label_t *vl,
return (0); return (0);
} }
/*
 * Decide, from the magic number stored in the label's embedded checksum,
 * whether the label is byteswapped.
 *
 * vdev_eck: embedded checksum whose zec_magic field is examined
 * l:        label index, used only in the error message
 * byteswap: set to B_FALSE when the magic equals ZEC_MAGIC, or to B_TRUE
 *           when it equals the byteswapped ZEC_MAGIC
 *
 * Returns 0 when one of the two magic values matched. Otherwise prints an
 * error to stderr and returns 1, leaving *byteswap unwritten.
 */
static int
zhack_repair_get_byteswap(const zio_eck_t *vdev_eck, const int l, int *byteswap)
{
	const uint64_t magic = vdev_eck->zec_magic;

	if (magic == ZEC_MAGIC) {
		*byteswap = B_FALSE;
		return (0);
	}

	if (magic == BSWAP_64((uint64_t)ZEC_MAGIC)) {
		*byteswap = B_TRUE;
		return (0);
	}

	/* Neither byte order matches: report the value that was found. */
	(void) fprintf(stderr, "error: label %d: "
	    "Expected the nvlist checksum magic number but instead got "
	    "0x%" PRIx64 "\n",
	    l, magic);
	return (1);
}
static void static void
zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset, zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,
const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum) const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum)
@ -740,33 +757,10 @@ zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,
} }
static int static int
zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, zhack_repair_get_ashift(nvlist_t *cfg, const int l, uint64_t *ashift)
const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg,
uint64_t *ashift)
{ {
int err; int err;
nvlist_t *vdev_tree_cfg;
if (ub->ub_txg != 0) {
(void) fprintf(stderr,
"error: label %d: UB TXG of 0 expected, but got %"
PRIu64 "\n",
l, ub->ub_txg);
(void) fprintf(stderr, "It would appear the device was not "
"properly removed.\n");
return (1);
}
for (int i = 0; i < cfg_keys_len; i++) {
uint64_t val;
err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val);
if (err) {
(void) fprintf(stderr,
"error: label %d, %d: "
"cannot find nvlist key %s\n",
l, i, cfg_keys[i]);
return (err);
}
}
err = nvlist_lookup_nvlist(cfg, err = nvlist_lookup_nvlist(cfg,
ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg); ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg);
@ -790,7 +784,7 @@ zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys,
(void) fprintf(stderr, (void) fprintf(stderr,
"error: label %d: nvlist key %s is zero\n", "error: label %d: nvlist key %s is zero\n",
l, ZPOOL_CONFIG_ASHIFT); l, ZPOOL_CONFIG_ASHIFT);
return (err); return (1);
} }
return (0); return (0);
@ -805,30 +799,35 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l)
*/ */
if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) { if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) {
const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp); const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp);
int err;
ub->ub_txg = txg; ub->ub_txg = txg;
if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) { err = nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG);
if (err) {
(void) fprintf(stderr, (void) fprintf(stderr,
"error: label %d: " "error: label %d: "
"Failed to remove pool creation TXG\n", "Failed to remove pool creation TXG\n",
l); l);
return (1); return (err);
} }
if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) { err = nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG);
if (err) {
(void) fprintf(stderr, (void) fprintf(stderr,
"error: label %d: Failed to remove pool TXG to " "error: label %d: Failed to remove pool TXG to "
"be replaced.\n", "be replaced.\n",
l); l);
return (1); return (err);
} }
if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) { err = nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg);
if (err) {
(void) fprintf(stderr, (void) fprintf(stderr,
"error: label %d: " "error: label %d: "
"Failed to add pool TXG of %" PRIu64 "\n", "Failed to add pool TXG of %" PRIu64 "\n",
l, txg); l, txg);
return (1); return (err);
} }
} }
@ -922,6 +921,7 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data,
BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
const uint64_t actual_magic = vdev_eck->zec_magic; const uint64_t actual_magic = vdev_eck->zec_magic;
int err = 0; int err = 0;
if (actual_magic != expected_magic) { if (actual_magic != expected_magic) {
(void) fprintf(stderr, "error: label %d: " (void) fprintf(stderr, "error: label %d: "
"Expected " "Expected "
@ -943,6 +943,36 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data,
return (err); return (err);
} }
/*
 * Unpack the label's packed nvlist configuration and verify that the
 * uint64 keys ZPOOL_CONFIG_VERSION, ZPOOL_CONFIG_POOL_STATE and
 * ZPOOL_CONFIG_GUID are all present.
 *
 * vl:  label whose vl_vdev_phys.vp_nvlist buffer is unpacked
 * l:   label index, used only in error messages
 * cfg: receives the unpacked nvlist on success
 *
 * Returns 0 on success; the caller then owns *cfg. On failure an error is
 * printed to stderr and the nonzero code from nvlist_unpack() or
 * nvlist_lookup_uint64() is returned. When a required key is missing, the
 * unpacked nvlist is freed and *cfg is set to NULL.
 */
static int
zhack_repair_unpack_cfg(vdev_label_t *vl, const int l, nvlist_t **cfg)
{
	const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
	    ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID };
	int err;

	/* The packed nvlist fills vdev_phys up to the trailing zio_eck_t. */
	err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist,
	    VDEV_PHYS_SIZE - sizeof (zio_eck_t), cfg, 0);
	if (err) {
		(void) fprintf(stderr,
		    "error: cannot unpack nvlist label %d\n", l);
		return (err);
	}

	for (int i = 0; i < ARRAY_SIZE(cfg_keys); i++) {
		uint64_t val;

		err = nvlist_lookup_uint64(*cfg, cfg_keys[i], &val);
		if (err) {
			(void) fprintf(stderr,
			    "error: label %d, %d: "
			    "cannot find nvlist key %s\n",
			    l, i, cfg_keys[i]);
			/*
			 * The caller returns on error without touching cfg,
			 * so release the nvlist here to avoid leaking it.
			 */
			nvlist_free(*cfg);
			*cfg = NULL;
			return (err);
		}
	}

	return (0);
}
static void static void
zhack_repair_one_label(const zhack_repair_op_t op, const int fd, zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
vdev_label_t *vl, const uint64_t label_offset, const int l, vdev_label_t *vl, const uint64_t label_offset, const int l,
@ -956,10 +986,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
(zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1; (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1;
const uint64_t vdev_phys_offset = const uint64_t vdev_phys_offset =
label_offset + offsetof(vdev_label_t, vl_vdev_phys); label_offset + offsetof(vdev_label_t, vl_vdev_phys);
const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID };
nvlist_t *cfg; nvlist_t *cfg;
nvlist_t *vdev_tree_cfg = NULL;
uint64_t ashift; uint64_t ashift;
int byteswap; int byteswap;
@ -967,18 +994,9 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
if (err) if (err)
return; return;
if (vdev_eck->zec_magic == 0) { err = zhack_repair_get_byteswap(vdev_eck, l, &byteswap);
(void) fprintf(stderr, "error: label %d: " if (err)
"Expected the nvlist checksum magic number to not be zero"
"\n",
l);
(void) fprintf(stderr, "There should already be a checksum "
"for the label.\n");
return; return;
}
byteswap =
(vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC));
if (byteswap) { if (byteswap) {
byteswap_uint64_array(&vdev_eck->zec_cksum, byteswap_uint64_array(&vdev_eck->zec_cksum,
@ -994,16 +1012,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
return; return;
} }
err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, err = zhack_repair_unpack_cfg(vl, l, &cfg);
VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0);
if (err) {
(void) fprintf(stderr,
"error: cannot unpack nvlist label %d\n", l);
return;
}
err = zhack_repair_check_label(ub,
l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift);
if (err) if (err)
return; return;
@ -1011,6 +1020,19 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
char *buf; char *buf;
size_t buflen; size_t buflen;
if (ub->ub_txg != 0) {
(void) fprintf(stderr,
"error: label %d: UB TXG of 0 expected, but got %"
PRIu64 "\n", l, ub->ub_txg);
(void) fprintf(stderr, "It would appear the device was "
"not properly detached.\n");
return;
}
err = zhack_repair_get_ashift(cfg, l, &ashift);
if (err)
return;
err = zhack_repair_undetach(ub, cfg, l); err = zhack_repair_undetach(ub, cfg, l);
if (err) if (err)
return; return;

View File

@ -33,13 +33,16 @@
# Test one: # Test one:
# #
# 1. Create pool on a loopback device with some test data # 1. Create pool on a loopback device with some test data
# 2. Export the pool. # 2. Checksum repair should work with a valid TXG. Repeatedly write and
# 3. Corrupt all label checksums in the pool # sync the pool so there are enough transactions for every uberblock
# 4. Check that pool cannot be imported # to have a TXG
# 5. Verify that it cannot be imported after using zhack label repair -u # 3. Export the pool.
# 4. Corrupt all label checksums in the pool
# 5. Check that pool cannot be imported
# 6. Verify that it cannot be imported after using zhack label repair -u
# to ensure that the -u option will quit on corrupted checksums. # to ensure that the -u option will quit on corrupted checksums.
# 6. Use zhack label repair -c on device # 7. Use zhack label repair -c on device
# 7. Check that pool can be imported and that data is intact # 8. Check that pool can be imported and that data is intact
# #
# Test two: # Test two:
# #
@ -170,6 +173,17 @@ function setup_dataset
check_dataset true check_dataset true
} }
# Repeatedly dirty and sync the pool.
#
# $1: number of iterations. Each iteration touches the file "test" in
#     $TESTDIR and then runs `zpool sync -f` on $TESTPOOL.
#
# Returns the exit status of the first command that fails, or 0 if every
# iteration succeeds.
function force_transactions
{
	L_TIMES="$1"
	i=0
	while ((i < L_TIMES))
	do
		touch "$TESTDIR"/"test" || return $?
		zpool sync -f "$TESTPOOL" || return $?
		((i += 1))
	done
	return 0
}
function get_practical_size function get_practical_size
{ {
L_SIZE="$1" L_SIZE="$1"
@ -257,6 +271,9 @@ function run_test_one
setup_dataset setup_dataset
# Force 256 extra transactions to ensure all uberblocks are assigned a TXG
log_must force_transactions 256
log_must zpool export "$TESTPOOL" log_must zpool export "$TESTPOOL"
corrupt_labels "$L_SIZE" "$VIRTUAL_DISK" corrupt_labels "$L_SIZE" "$VIRTUAL_DISK"

View File

@ -18,13 +18,16 @@
# Strategy: # Strategy:
# #
# 1. Create pool on a loopback device with some test data # 1. Create pool on a loopback device with some test data
# 2. Export the pool. # 2. Checksum repair should work with a valid TXG. Repeatedly write and
# 3. Corrupt all label checksums in the pool # sync the pool so there are enough transactions for every uberblock
# 4. Check that pool cannot be imported # to have a TXG
# 5. Verify that it cannot be imported after using zhack label repair -u # 3. Export the pool.
# 4. Corrupt all label checksums in the pool
# 5. Check that pool cannot be imported
# 6. Verify that it cannot be imported after using zhack label repair -u
# to ensure that the -u option will quit on corrupted checksums. # to ensure that the -u option will quit on corrupted checksums.
# 6. Use zhack label repair -c on device # 7. Use zhack label repair -c on device
# 7. Check that pool can be imported and that data is intact # 8. Check that pool can be imported and that data is intact
. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib . "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib