libspl/backtrace: comment and harden libunwind backtracer

This is the sort of code that we get right once and never look at again.
Anyone reading this code is already likely in the middle of a debugging
nightmare, and then they have a wall of manual string construction and
an unfamiliar and idiosyncratic library to deal with. So, comment the
whole thing to try to make it clear what's going on.

In pursuit of the above, I've added return checks to some of the
libunwind calls, fixed the frame loop to not skip the "top" frame
(however useless it may be), and fixed a couple of calls to
spl_bt_u64_to_hex_str() which requested 18 digits instead of 16.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16653
This commit is contained in:
Rob Norris 2024-10-18 15:10:33 +11:00 committed by Brian Behlendorf
parent 2596a75306
commit b85c564161

View File

@ -79,61 +79,177 @@ libspl_backtrace(int fd)
unw_cursor_t cp;
unw_word_t v;
char buf[128];
size_t n, c;
size_t n;
int err;
/* Snapshot the current frame and state. */
unw_getcontext(&uc);
unw_init_local(&cp, &uc);
/*
* TODO: walk back to the frame that tripped the assertion / the place
* where the signal was received.
*/
/*
* Register dump. We're going to loop over all the registers in the
* top frame, and show them, with names, in a nice three-column
* layout, which keeps us within 80 columns.
*/
spl_bt_write(fd, "Registers:\n");
c = 0;
/* Initialise a frame cursor, starting at the current frame */
unw_init_local(&cp, &uc);
/*
* libunwind's list of possible registers for this architecture is an
* enum, unw_regnum_t. UNW_TDEP_LAST_REG is the highest-numbered
* register in that list, however, not all register numbers in this
* range are defined by the architecture, and not all defined registers
* will be present on every implementation of that architecture.
* Moreover, libunwind provides nice names for most, but not all
* registers, but these are hardcoded; a name being available does not
* mean that register is available.
*
* So, we have to pull this all together here. We try to get the value
* of every possible register. If we get a value for it, then the
* register must exist, and so we get its name. If libunwind has no
* name for it, we synthesize something. These cases should be rare,
* and they're usually for uninteresting or niche registers, so it
* shouldn't really matter. We can see the value, and that's the main
* thing.
*/
uint_t cols = 0;
for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) {
/*
* Get the value. Any error probably means the register
* doesn't exist, and we skip it.
*/
if (unw_get_reg(&cp, regnum, &v) < 0)
continue;
/*
* Register name. If libunwind doesn't have a name for it,
* it will return "???". As a shortcut, we just treat '?'
* as an alternate end-of-string character.
*/
const char *name = unw_regname(regnum);
for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {}
if (n == 0) {
/*
* No valid name, so make one of the form "?xx", where
* "xx" is the two-char hex of libunwind's register
* number.
*/
buf[0] = '?';
n = spl_bt_u64_to_hex_str(regnum, 2,
&buf[1], sizeof (buf)-1) + 1;
name = buf;
}
/*
* Two spaces of padding before each column, plus extra
* spaces to align register names shorter than three chars.
*/
spl_bt_write_n(fd, " ", 5-MIN(n, 3));
/* Register name and column punctuation */
spl_bt_write_n(fd, name, n);
spl_bt_write(fd, ": 0x");
n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf));
/*
* Convert register value (from unw_get_reg()) to hex. We're
* assuming that all registers are 64-bits wide, which is
* probably fine for any general-purpose registers on any
* machine currently in use. A more generic way would be to
* look at the width of unw_word_t, but that would also
* complicate the column code a bit. This is fine.
*/
n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf));
spl_bt_write_n(fd, buf, n);
if (!(++c % 3))
/* Every third column, emit a newline */
if (!(++cols % 3))
spl_bt_write(fd, "\n");
}
if (c % 3)
/* If we finished before the third column, emit a newline. */
if (cols % 3)
spl_bt_write(fd, "\n");
unw_init_local(&cp, &uc);
/* Now the main event, the backtrace. */
spl_bt_write(fd, "Call trace:\n");
while (unw_step(&cp) > 0) {
unw_get_reg(&cp, UNW_REG_IP, &v);
/* Reset the cursor to the top again. */
unw_init_local(&cp, &uc);
do {
/*
* Getting the IP should never fail; libunwind handles it
* specially, because it's used a lot internally. Still, no
* point being silly about it, as the last thing we want is
* our crash handler to crash. So if it ever does fail, we'll
* show an error line, but keep going to the next frame.
*/
if (unw_get_reg(&cp, UNW_REG_IP, &v) < 0) {
spl_bt_write(fd, " [couldn't get IP register; "
"corrupt frame?]");
continue;
}
/* IP & punctuation */
n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf));
spl_bt_write(fd, " [0x");
n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf));
spl_bt_write_n(fd, buf, n);
spl_bt_write(fd, "] ");
unw_get_proc_name(&cp, buf, sizeof (buf), &v);
for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
spl_bt_write_n(fd, buf, n);
spl_bt_write(fd, "+0x");
n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
spl_bt_write_n(fd, buf, n);
/*
* Function ("procedure") name for the current frame. `v`
* receives the offset from the named function to the IP, which
* we show as a "+offset" suffix.
*
* If libunwind can't determine the name, we just show "???"
* instead. We've already displayed the IP above; that will
* have to do.
*
* unw_get_proc_name() will return -UNW_ENOMEM if the buffer is too
* small, but still leaves a truncated name in it. So we treat that
* as a success and use whatever is in the buffer.
*/
err = unw_get_proc_name(&cp, buf, sizeof (buf), &v);
if (err == 0 || err == -UNW_ENOMEM) {
for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
spl_bt_write_n(fd, buf, n);
/* Offset from proc name */
spl_bt_write(fd, "+0x");
n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
spl_bt_write_n(fd, buf, n);
} else
spl_bt_write(fd, "???");
#ifdef HAVE_LIBUNWIND_ELF
spl_bt_write(fd, " (in ");
unw_get_elf_filename(&cp, buf, sizeof (buf), &v);
for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
spl_bt_write_n(fd, buf, n);
spl_bt_write(fd, " +0x");
n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
spl_bt_write_n(fd, buf, n);
spl_bt_write(fd, ")");
/*
* Newer libunwind has unw_get_elf_filename(), which gets
* the name of the ELF object that the frame was executing in.
* Like `unw_get_proc_name()`, `v` receives the offset within
* the file, and UNW_ENOMEM indicates that a truncated filename
* was left in the buffer.
*/
err = unw_get_elf_filename(&cp, buf, sizeof (buf), &v);
if (err == 0 || err == -UNW_ENOMEM) {
for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
spl_bt_write(fd, " (in ");
spl_bt_write_n(fd, buf, n);
/* Offset within file */
spl_bt_write(fd, " +0x");
n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
spl_bt_write_n(fd, buf, n);
spl_bt_write(fd, ")");
}
#endif
spl_bt_write(fd, "\n");
}
} while (unw_step(&cp) > 0);
}
#elif defined(HAVE_BACKTRACE)
#include <execinfo.h>