fix(memctx): retry cold bootstrap with backoff while guest boots

The cold address-space bootstrap (host_bootstrap -> System DTB) ran once and was
terminal: when the adapter attached before the guest finished booting, no System
process was found, the adapter emitted a single ERROR and never retried, so the
memctx datum was never published.

Make it self-healing: on bootstrap failure arm a one-shot backoff timerfd (a
second adapter fd, demuxed by cookie) that re-kicks the bootstrap until it
succeeds; reset and disarm on success. Replace the per-failure URGENT ERROR (a
still-booting guest is transient, not a fault) with a single diagnostic line on
the first failure. Add a stub fail-injection (cfg fail_boots) and test_retry.

Bump 0.3.5.
This commit is contained in:
2026-06-24 11:20:32 +03:00
parent 6fea392d46
commit 26e5ab4709
4 changed files with 156 additions and 15 deletions
+37 -1
View File
@@ -42,7 +42,7 @@ struct holder {
holder* peer; /* multi-VM: stop when both are ready (or NULL) */
int is_driver; /* stops the loop on a condition */
uint32_t expect_ep;
int memctx, invalidated, ticks, bad_ep;
int memctx, invalidated, ticks, bad_ep, errors;
uint64_t last_kcr3, kcr3_e0;
uint32_t last_epoch, last_nseg;
int ro_ok, rw_eacces, seg0_ok;
@@ -66,6 +66,7 @@ static int h_on_ev(void* u, const vmsig_event* ev) {
holder* h = u;
if (ev->kind == VMSIG_EV_VM_LIFECYCLE) h->ticks++;
else if (ev->kind == VMSIG_EV_MEMCTX_INVALIDATED) h->invalidated++;
else if (ev->kind == VMSIG_EV_ERROR) h->errors++; /* no boot-retry ERROR spam */
maybe_stop(h);
return 0;
}
@@ -393,6 +394,40 @@ static void test_ro_fd_ownership(void) {
if (fcntl(ro, F_GETFD) >= 0) close(ro); /* belt-and-braces if the assert failed */
}
/* ---- 7. cold-bootstrap retry: stub fails N times, then publishes via backoff ----- *
 * Regression test for the cold-bootstrap-while-guest-boots bug. A failed bootstrap must be
 * neither terminal nor reported as an URGENT ERROR: it arms a one-shot backoff timerfd that
 * re-kicks the bootstrap until it succeeds. With fail_boots=3 the first three stub
 * bootstraps fail deterministically (no timing dependence); the real timerfd fires at
 * ~50/100/200ms, so the 4th kick succeeds in under a second. vmhost is added (as in
 * test_multicast) for the ticks failsafe and a realistic loop; the run stops on
 * memctx>=1 (stop_epoch=-1). */
static void test_retry(void) {
    printf("test_retry\n");

    vmsig_ctx* ctx = vmsig_ctx_new();
    vmsig_core* core = vmsig_core_new(ctx);

    /* Driver holder: starts fully zeroed, so every counter checked below begins at 0. */
    holder h;
    memset(&h, 0, sizeof(h));
    h.stop_epoch = -1;
    h.expect_ep = 0;
    h.is_driver = 1;
    h.core = core;

    /* Stub memctx configuration whose first three bootstraps are forced to fail. */
    vmsig_memctx_cfg mc;
    memset(&mc, 0, sizeof(mc));
    mc.fail_boots = 3;
    mc.ro_fd = -1;
    mc.low = 0;
    mc.ram_path = NULL;
    mc.stub = 1;

    /* OBSERVE lets vmhost lifecycle ticks reach maybe_stop (the ticks>30 failsafe) and lets
     * any ERROR event be counted; the MEMCTX cap is what delivers the published context. */
    add_holder(core, &h, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0);

    /* Adapter order is kept as-is: vmhost (watchdog) first, then the failing memctx stub. */
    CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost (watchdog)");
    CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), &mc, 0) >= 0, "add memctx (fail_boots=3)");

    vmsig_core_run(core);

    /* The retry path must have published a usable context without emitting any ERROR. */
    CHECK(h.memctx >= 1, "MEMCTX published after a series of bootstrap failures (retry worked)");
    CHECK(h.last_kcr3 != 0, "valid kcr3 after the successful retry");
    CHECK(h.errors == 0, "no ERROR spam during boot retries");

    vmsig_core_free(core);
    vmsig_ctx_free(ctx);
}
int main(void) {
test_multicast();
test_epoch();
@@ -400,6 +435,7 @@ int main(void) {
test_multivm();
test_socket();
test_ro_fd_ownership();
test_retry();
printf("memctx tests: %s\n", g_fail ? "FAIL" : "PASS");
return g_fail ? 1 : 0;
}