From 3142337e626110754f8dcee6cda409fc10c5156c Mon Sep 17 00:00:00 2001 From: Gregory Lirent Date: Wed, 24 Jun 2026 14:26:50 +0300 Subject: [PATCH] fix(discovery): derive the below-4G split robustly from fragmented mtree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit host_probe derived the guest's below-4G split (vmie `low`) by taking the first GPA-0 RAM run in `info mtree -f`. When low RAM is fragmented by overlay pages (Hyper-V SynIC) and blackhole holes (smbase/tseg), that first run is a tiny fragment, so the split came out far too small and host_bootstrap could never recover the System DTB — the memctx context was never published. Extract a pure parser, mtree_low_split(): anchor on the system flatview, take `low` from the @file-offset of the high-RAM region at GPA >= 4 GiB (which equals the split by construction), cross-validate against the PCI-hole base, and fail closed when it can't be derived. QMP-reply un-escaping moves to the transport boundary so the parser works on plain text. Unit-tested against a synthetic fragmented flatview including a decoy non-system address space. postinst also hints to restart the daemon after an upgrade (a running instance keeps the old build until restarted). Bump 0.3.6. --- CMakeLists.txt | 12 +- packaging/deb/vmsig/postinst | 12 +- src/discovery/include/mtree.h | 12 ++ src/discovery/linux/host_probe.c | 59 ++++--- src/discovery/linux/mtree.c | 177 +++++++++++++++++++ src/test/fixtures/mtree_split_fragmented.txt | 42 +++++ src/test/test_mtree.c | 83 +++++++++ 7 files changed, 372 insertions(+), 25 deletions(-) create mode 100644 src/discovery/include/mtree.h create mode 100644 src/discovery/linux/mtree.c create mode 100644 src/test/fixtures/mtree_split_fragmented.txt create mode 100644 src/test/test_mtree.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bc1a1b..28f22d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.16) # Single source of truth for the version: CI passes -DVMSIG_VERSION=${TAG#v}, so the project # version (-> libvgpu-perception SONAME/.so version) and the .deb version come from one tag. -set(VMSIG_VERSION "0.3.5" CACHE STRING "Release version (MAJOR.MINOR.PATCH); CI passes the tag") +set(VMSIG_VERSION "0.3.6" CACHE STRING "Release version (MAJOR.MINOR.PATCH); CI passes the tag") project(vmsig VERSION ${VMSIG_VERSION} LANGUAGES C) set(CMAKE_C_STANDARD 17) @@ -65,6 +65,7 @@ add_library(vmsig SHARED src/control/socket.c src/discovery/slot.c src/discovery/linux/host_probe.c + src/discovery/linux/mtree.c src/discovery/discovery.c # SI input driver (vmctl), absorbed in-tree (host-only: QMP + uinput) src/si/input/open.c @@ -216,6 +217,15 @@ target_include_directories(vmsig_discoverytest PRIVATE target_compile_options(vmsig_discoverytest PRIVATE -Wall -Wextra) add_test(NAME discovery COMMAND vmsig_discoverytest) +add_executable(vmsig_mtreetest src/test/test_mtree.c) +target_link_libraries(vmsig_mtreetest PRIVATE vmsig) +target_include_directories(vmsig_mtreetest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/discovery/include) +target_compile_definitions(vmsig_mtreetest PRIVATE + FIXTURE_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/test/fixtures") +target_compile_options(vmsig_mtreetest PRIVATE -Wall -Wextra) +add_test(NAME mtree COMMAND vmsig_mtreetest) + add_executable(vmsig_daemoncfgtest src/test/test_daemoncfg.c src/daemon/config.c diff --git a/packaging/deb/vmsig/postinst b/packaging/deb/vmsig/postinst index 9a46051..c7d433e 100755 --- a/packaging/deb/vmsig/postinst +++ b/packaging/deb/vmsig/postinst @@ -11,7 +11,17 @@ configure) systemd-tmpfiles --create /usr/lib/tmpfiles.d/vmsig.conf || true systemctl enable vmsigd.service || true # enable, but do NOT start fi - echo "vmsig: review the [grant] policy in /etc/vmsig/vmsigd.conf, then: systemctl start vmsigd" >&2 + if [ -z "$2" ]; then + # fresh install ($2 empty): enabled but NOT started — the operator reviews the + # grant policy before the first start. + echo "vmsig: review the [grant] policy in /etc/vmsig/vmsigd.conf, then: systemctl start vmsigd" >&2 + else + # upgrade ($2 = old version): a running daemon keeps the OLD in-memory image until + # restarted — the new build is not applied automatically. Not auto-restarted here: + # the start is gated on the grant policy, so the operator owns the moment. try-restart + # touches the daemon only if it is currently running (leaves a stopped one alone). + echo "vmsig: upgraded from $2 — a running daemon still runs the old build; apply with: systemctl try-restart vmsigd" >&2 + fi ;; abort-upgrade|abort-remove|abort-deconfigure) ;; diff --git a/src/discovery/include/mtree.h b/src/discovery/include/mtree.h new file mode 100644 index 0000000..1a7bb98 --- /dev/null +++ b/src/discovery/include/mtree.h @@ -0,0 +1,12 @@ +#ifndef VMSIG_MTREE_H +#define VMSIG_MTREE_H + +#include + +/* Derive the below-4G split (== vmie `low`: low-RAM GPA bound AND high-RAM file offset) + * from `info mtree -f` text. Operates on the system flatview only. FAIL-CLOSED: 0 if the + * split cannot be derived with confidence. `text` is plain UTF-8 with real '\n' + * (the caller un-escapes the QMP JSON string first). */ +uint64_t mtree_low_split(const char* text); + +#endif /* VMSIG_MTREE_H */ diff --git a/src/discovery/linux/host_probe.c b/src/discovery/linux/host_probe.c index f71ab48..c7a9b0a 100644 --- a/src/discovery/linux/host_probe.c +++ b/src/discovery/linux/host_probe.c @@ -5,6 +5,7 @@ * leaves ok=0 (the VM is not brought up rather than guessed). */ #define _GNU_SOURCE #include "host_probe.h" +#include "mtree.h" /* mtree_low_split */ #include "vmsig_event.h" /* VMSIG_VM_* */ #include #include @@ -174,29 +175,39 @@ static int qmp_status_word(const char* buf) { return VMSIG_VM_UNKNOWN; } -/* Derive the below-4G split from `info mtree` text: the size of the RAM region whose guest - * physical range starts at address 0. Standard QEMU split-RAM layout puts low RAM at - * [0, low) and high RAM above 4G at file offset @low. FAIL-CLOSED: 0 if not found. - * NOTE: parses HMP text (not a stable QMP schema) — verify against real `info mtree` output. */ -static uint64_t mtree_low(const char* ret) { - /* The return is a JSON string; lines inside are escaped "\n". Scan for the GPA-0 ram run: - * " 0000000000000000- (prio N, ram): ..." */ - const char* p = ret; - while ((p = strstr(p, "0000000000000000-")) != NULL) { - const char* end_hex = p + 17; /* 16 zeros + '-' */ - char* stop = NULL; - unsigned long long end = strtoull(end_hex, &stop, 16); - /* the descriptor after the range must mark it RAM (not the i/o "system" root) */ - const char* tail = stop ? stop : end_hex; - const char* nl = strstr(tail, "\\n"); - const char* lim = nl ? nl : (tail + 64); - int is_ram = 0; - for (const char* q = tail; q < lim && *q; q++) - if (!strncmp(q, "ram)", 4)) { is_ram = 1; break; } - if (is_ram && end > 0 && end != ~0ull) return end + 1ull; /* [0, end] => low=end+1 */ - p = end_hex; +/* Extract the JSON string value of "return" from an HMP-over-QMP reply and decode its + * transport escapes (\n \t \" \\) in place into a NUL-terminated plain-text buffer. The + * `info mtree -f` output is one JSON string with embedded escaped newlines; un-escaping is + * a transport detail of HMP-over-QMP and belongs here (next to the QMP code), so the split + * parser (mtree_low_split) can work on human-readable text with real '\n'. The decode never + * grows the buffer (every escape shortens it), so it writes into `out` (>= strlen(buf)+1). + * Returns 1 on success, 0 if no "return" string is present. */ +static int qmp_return_plain(const char* buf, char* out, size_t cap) { + const char* r = strstr(buf, "\"return\""); + if (!r) return 0; + r = strchr(r, ':'); if (!r) return 0; + r = strchr(r, '"'); if (!r) return 0; /* opening quote of the string value */ + r++; + size_t o = 0; + for (; *r && o + 1 < cap; r++) { + char c = *r; + if (c == '"') break; /* closing quote */ + if (c == '\\' && r[1]) { + r++; + switch (*r) { + case 'n': c = '\n'; break; + case 't': c = '\t'; break; + case 'r': c = '\r'; break; + case '"': c = '"'; break; + case '\\': c = '\\'; break; + case '/': c = '/'; break; + default: c = *r; break; /* unknown escape: take it literally */ + } + } + out[o++] = c; } - return 0; + out[o] = 0; + return 1; } static int hp_live(const struct vmsig_host_probe* p, vmsig_host_facts* io) { @@ -221,7 +232,9 @@ static int hp_live(const struct vmsig_host_probe* p, vmsig_host_facts* io) { if (qmp_cmd(fd, "{\"execute\":\"human-monitor-command\"," "\"arguments\":{\"command-line\":\"info mtree -f\"}}\n", buf, 256 * 1024) == 1) { - io->low = mtree_low(buf); + /* un-escape the HMP string in place (it only shrinks), then parse the split */ + if (qmp_return_plain(buf, buf, 256 * 1024)) + io->low = mtree_low_split(buf); } } diff --git a/src/discovery/linux/mtree.c b/src/discovery/linux/mtree.c new file mode 100644 index 0000000..1a49d1d --- /dev/null +++ b/src/discovery/linux/mtree.c @@ -0,0 +1,177 @@ +/* mtree.c — derive the below-4G split (vmie `low`) from `info mtree -f` text. + * + * `low` is one number with two meanings (see vmie low_segs): the GPA bound of low-RAM + * ([0,low) maps 1:1 to file[0,low)) AND the file offset at which RAM resumes above 4 GiB + * (GPA 4GiB -> file[low]). The robust signal for it is therefore the `@` suffix + * of the high-RAM ram region (GPA >= 4 GiB): that offset IS `low` by construction. + * + * Low-RAM below 4 GiB is fragmented (Hyper-V synic overlays, smbase/tseg blackhole i/o + * holes, rom holes), so "end of the first contiguous ram run" is NOT a reliable split. + * We never trust it. Primary signal: high-RAM `@offset`. Cross-validator / fallback: + * the start GPA of the first non-ram region at or above the standard PCI-hole base + * (0x80000000) — the bottom of the 4 GiB PCI hole, which equals `low` for the classic + * single-`low` layout. The two must agree when both are present; otherwise fail-closed. + * + * Pure text, line by line, no allocation beyond the input, no I/O. FAIL-CLOSED: any + * unexpected/incomplete input yields 0 ("not found"); 0 is reserved for that. */ +#include "mtree.h" +#include +#include + +/* Standard QEMU/i440fx/q35 PCI-hole base (bottom of the 4 GiB hole). Used ONLY as the + * lower cutoff for the cross-validator/fallback, never hardcoded as the answer. */ +#define PCI_HOLE_BASE 0x80000000ull +/* 4 GiB: high-RAM (the ram region carrying `@low`) starts at or above this GPA. */ +#define RAM_HIGH_BASE 0x100000000ull + +/* Parse exactly `n` hex digits at p into *out. Returns the char past the last digit, or + * NULL if there are not n hex digits (no partial consume). */ +static const char* parse_hexn(const char* p, int n, uint64_t* out) { + uint64_t v = 0; + for (int i = 0; i < n; i++) { + char c = p[i]; + unsigned d; + if (c >= '0' && c <= '9') d = (unsigned)(c - '0'); + else if (c >= 'a' && c <= 'f') d = (unsigned)(c - 'a' + 10); + else if (c >= 'A' && c <= 'F') d = (unsigned)(c - 'A' + 10); + else return NULL; + v = (v << 4) | d; + } + *out = v; + return p + n; +} + +/* One region line of a flatview body, e.g. + * " 0000000100000000-000000027fffffff (prio 0, ram): ram0 @0000000080000000 KVM" + * Two leading spaces, 16-hex start, '-', 16-hex end, " (prio , ): ". + * Fills *start_gpa, *is_ram and, when present in , *file_off (with *has_off=1). + * Returns 1 on a well-formed region line, 0 otherwise (not a region line for us). */ +typedef struct { + uint64_t start_gpa; + int is_ram; /* flag is exactly "ram" (not ramd/romd/rom/i/o/container) */ + int has_off; /* a "@" suffix was present in the descriptor */ + uint64_t file_off; /* value of that suffix */ +} region_line; + +static int parse_region_line(const char* line, const char* nl, region_line* out) { + /* leading " " then 16 hex, '-', 16 hex */ + if (line[0] != ' ' || line[1] != ' ') return 0; + const char* p = line + 2; + uint64_t start, end; + p = parse_hexn(p, 16, &start); + if (!p || *p != '-') return 0; + p++; + p = parse_hexn(p, 16, &end); + if (!p) return 0; + + /* " (prio , ):" — find the flag between ", " and ")". */ + if (strncmp(p, " (prio ", 7) != 0) return 0; + const char* comma = memchr(p, ',', (size_t)(nl - p)); + if (!comma) return 0; + const char* flag = comma + 1; + while (flag < nl && *flag == ' ') flag++; + const char* rparen = memchr(flag, ')', (size_t)(nl - flag)); + if (!rparen) return 0; + size_t flen = (size_t)(rparen - flag); + + out->start_gpa = start; + out->is_ram = (flen == 3 && strncmp(flag, "ram", 3) == 0) ? 1 : 0; + + /* optional "@" anywhere in the descriptor tail (after "): "). */ + out->has_off = 0; + out->file_off = 0; + const char* at = memchr(rparen, '@', (size_t)(nl - rparen)); + if (at) { + char* stop = NULL; + unsigned long long v = strtoull(at + 1, &stop, 16); + if (stop && stop != at + 1) { out->has_off = 1; out->file_off = (uint64_t)v; } + } + return 1; +} + +/* Locate the system flatview body: the lines AFTER " Root memory region: system" up to + * the next "FlatView #" (or EOF). Returns the body start, sets *body_end; NULL if absent. */ +static const char* find_system_flatview(const char* text, const char** body_end) { + const char* anchor = "Root memory region: system"; + const char* p = text; + while ((p = strstr(p, anchor)) != NULL) { + /* The root name must end the token (newline/EOF) — reject "system.flash0" etc., + * and reject roots that merely contain the word elsewhere. */ + const char* after = p + strlen(anchor); + if (*after == '\n' || *after == '\0' || *after == ' ') { + const char* body = strchr(p, '\n'); + if (!body) return NULL; + body++; /* first region line */ + const char* fv = strstr(body, "\nFlatView #"); + *body_end = fv ? fv + 1 : (body + strlen(body)); + return body; + } + p = after; + } + return NULL; +} + +/* Primary signal: file offset (`@hex`) of the first ram region whose start GPA >= 4 GiB. + * Returns 1 and sets *off when found, 0 otherwise. */ +static int high_ram_offset(const char* body, const char* end, uint64_t* off) { + const char* p = body; + while (p < end) { + const char* nl = memchr(p, '\n', (size_t)(end - p)); + const char* line_end = nl ? nl : end; + region_line r; + if (parse_region_line(p, line_end, &r) && + r.is_ram && r.start_gpa >= RAM_HIGH_BASE && r.has_off) { + *off = r.file_off; + return 1; + } + if (!nl) break; + p = nl + 1; + } + return 0; +} + +/* Cross-validator / fallback: start GPA of the first non-ram region at or above the + * PCI-hole base (the bottom of the 4 GiB hole == low for the classic layout). Returns 1 + * and sets *base when found, 0 otherwise. Blackhole holes below 0x80000000 are skipped + * by the lower cutoff. */ +static int pci_hole_start(const char* body, const char* end, uint64_t* base) { + const char* p = body; + while (p < end) { + const char* nl = memchr(p, '\n', (size_t)(end - p)); + const char* line_end = nl ? nl : end; + region_line r; + if (parse_region_line(p, line_end, &r) && + !r.is_ram && r.start_gpa >= PCI_HOLE_BASE && r.start_gpa < RAM_HIGH_BASE) { + *base = r.start_gpa; + return 1; + } + if (!nl) break; + p = nl + 1; + } + return 0; +} + +uint64_t mtree_low_split(const char* text) { + if (!text) return 0; + + const char* body_end = NULL; + const char* body = find_system_flatview(text, &body_end); + if (!body) return 0; /* no system AS => fail-closed */ + + uint64_t off = 0, base = 0; + int have_off = high_ram_offset(body, body_end, &off); + int have_base = pci_hole_start(body, body_end, &base); + + if (have_off) { + if (off == 0 || off == ~0ull) return 0; /* degenerate offset */ + /* cross-validate against the PCI-hole base when we have one */ + if (have_base && base != off) return 0; /* layout anomaly => fail-closed */ + return off; /* primary signal */ + } + + /* No high-RAM (guest RAM all below 4 GiB): fall back to the PCI-hole base, but only + * at or above the standard base so blackhole holes can never be mistaken for it. */ + if (have_base && base >= PCI_HOLE_BASE) return base; + + return 0; /* nothing trustworthy */ +} diff --git a/src/test/fixtures/mtree_split_fragmented.txt b/src/test/fixtures/mtree_split_fragmented.txt new file mode 100644 index 0000000..d5e6e9a --- /dev/null +++ b/src/test/fixtures/mtree_split_fragmented.txt @@ -0,0 +1,42 @@ +FlatView #0 + AS "cpu-smm-0", root: mem-container-smram + Root memory region: mem-container-smram + 0000000000000000-0000000000017fff (prio 0, ram): ram0 + 0000000000018000-0000000000018fff (prio 0, ram): synic-0-msg-page + 000000000001c000-000000007fffffff (prio 0, ram): ram0 @000000000001c000 + 0000000080000000-0000000081ffffff (prio 0, i/o): vfio-pci-bar3 + 0000000100000000-000000017fffffff (prio 0, ram): ram0 @0000000040000000 + +FlatView #1 + AS "I/O", root: io + Root memory region: io + 0000000000000000-0000000000000007 (prio 0, i/o): dma-chan + 0000000000000060-0000000000000060 (prio 0, i/o): i8042-data + 0000000000000064-0000000000000064 (prio 0, i/o): i8042-cmd + +FlatView #2 + AS "memory", root: system + AS "cpu-memory-0", root: system + Root memory region: system + 0000000000000000-0000000000017fff (prio 0, ram): ram0 + 0000000000018000-0000000000018fff (prio 0, ram): synic-0-msg-page + 0000000000019000-0000000000019fff (prio 0, ram): synic-1-msg-page + 000000000001a000-000000000001afff (prio 0, ram): synic-2-msg-page + 000000000001b000-000000000001bfff (prio 0, ram): synic-3-msg-page + 000000000001c000-000000000002ffff (prio 0, ram): ram0 @000000000001c000 + 0000000000030000-000000000004ffff (prio 1, i/o): smbase-blackhole + 0000000000050000-00000000000bffff (prio 0, ram): ram0 @0000000000050000 + 00000000000c0000-00000000000dffff (prio 1, rom): pc.rom + 00000000000e0000-00000000000fffff (prio 0, rom): system.flash0 @000000000035c000 + 0000000000100000-000000007bffffff (prio 0, ram): ram0 @0000000000100000 + 000000007c000000-000000007fffffff (prio 1, i/o): tseg-blackhole + 0000000080000000-0000000081ffffff (prio 0, i/o): vfio-pci-bar3 + 0000000082000000-0000000082087fff (prio 0, i/o): vfio-pci-bar0 + 00000000e0000000-00000000efffffff (prio 0, i/o): pcie-mmcfg-mmio + 00000000fec00000-00000000fec00fff (prio 0, i/o): kvm-ioapic + 00000000ffc00000-00000000ffc83fff (prio 0, romd): system.flash1 + 0000000100000000-000000027fffffff (prio 0, ram): ram0 @0000000080000000 + +FlatView #3 + AS "pci_bridge_io", root: pci_bridge_io + Root memory region: pci_bridge_io diff --git a/src/test/test_mtree.c b/src/test/test_mtree.c new file mode 100644 index 0000000..c69f20d --- /dev/null +++ b/src/test/test_mtree.c @@ -0,0 +1,83 @@ +/* test_mtree.c — unit tests for mtree_low_split (the below-4G split parser). Pure text in, + * number out; no QMP/transport. The fragmented fixture reproduces the structural traps the + * old heuristic tripped on (Hyper-V synic overlays, smbase/tseg blackhole holes, rom holes) + * plus a decoy non-system flatview that carries its OWN GPA-0 stub and a DIFFERENT @offset, + * proving the system address space is selected (not "first match in the text"). */ +#define _GNU_SOURCE +#include "mtree.h" +#include +#include +#include + +#ifndef FIXTURE_DIR +#define FIXTURE_DIR "." +#endif + +static int g_fail = 0; +#define CHECK(cond, msg) do { if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } } while (0) + +/* Slurp a whole text file into a heap buffer (NUL-terminated). NULL on error. */ +static char* slurp(const char* path) { + FILE* f = fopen(path, "rb"); + if (!f) return NULL; + if (fseek(f, 0, SEEK_END) != 0) { fclose(f); return NULL; } + long sz = ftell(f); + if (sz < 0) { fclose(f); return NULL; } + rewind(f); + char* buf = malloc((size_t)sz + 1); + if (!buf) { fclose(f); return NULL; } + size_t got = fread(buf, 1, (size_t)sz, f); + fclose(f); + buf[got] = 0; + return buf; +} + +/* Case B: a minimal, NON-fragmented system flatview — one big GPA-0 ram run plus high-RAM + * carrying @. Must not be broken by the new parser. */ +static const char* k_happy = + "FlatView #0\n" + " AS \"memory\", root: system\n" + " Root memory region: system\n" + " 0000000000000000-000000007fffffff (prio 0, ram): ram0\n" + " 0000000080000000-0000000081ffffff (prio 0, i/o): vfio-pci-bar3\n" + " 0000000100000000-000000017fffffff (prio 0, ram): ram0 @0000000080000000\n"; + +/* Case C: text without any system flatview => fail-closed. */ +static const char* k_no_system = + "FlatView #0\n" + " AS \"I/O\", root: io\n" + " Root memory region: io\n" + " 0000000000000000-0000000000000007 (prio 0, i/o): dma-chan\n"; + +int main(void) { + printf("test_mtree\n"); + + /* Cases A and E: the fragmented fixture (decoy first, system second). */ + char path[1024]; + snprintf(path, sizeof path, "%s/mtree_split_fragmented.txt", FIXTURE_DIR); + char* frag = slurp(path); + CHECK(frag != NULL, "fragmented fixture loaded"); + if (frag) { + uint64_t low = mtree_low_split(frag); + /* A: fragmented low-RAM must NOT yield the GPA-0 stub end (0x18000) — the bug. */ + CHECK(low == 0x80000000ull, "A: fragmented split == 0x80000000"); + CHECK(low != 0x18000ull, "A: not the GPA-0 stub end (0x18000)"); + /* E: the decoy (non-system) flatview comes FIRST and carries @0x40000000; the + * function must select the SYSTEM flatview (@0x80000000), not the decoy. */ + CHECK(low != 0x40000000ull, "E: decoy flatview @offset rejected (system AS chosen)"); + free(frag); + } + + /* Case B: happy path (non-fragmented) still resolves to the high-RAM @offset. */ + CHECK(mtree_low_split(k_happy) == 0x80000000ull, "B: non-fragmented happy path == 0x80000000"); + + /* Case C: no system flatview => 0. */ + CHECK(mtree_low_split(k_no_system) == 0, "C: no system flatview => fail-closed 0"); + + /* Case D: garbage / empty => 0. */ + CHECK(mtree_low_split("") == 0, "D: empty text => 0"); + CHECK(mtree_low_split("not an mtree at all\n") == 0, "D: junk text => 0"); + + printf("mtree tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +}