diff --git a/CMakeLists.txt b/CMakeLists.txt index 8509647..6bc1a1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.16) # Single source of truth for the version: CI passes -DVMSIG_VERSION=${TAG#v}, so the project # version (-> libvgpu-perception SONAME/.so version) and the .deb version come from one tag. -set(VMSIG_VERSION "0.3.4" CACHE STRING "Release version (MAJOR.MINOR.PATCH); CI passes the tag") +set(VMSIG_VERSION "0.3.5" CACHE STRING "Release version (MAJOR.MINOR.PATCH); CI passes the tag") project(vmsig VERSION ${VMSIG_VERSION} LANGUAGES C) set(CMAKE_C_STANDARD 17) diff --git a/src/adapter/memctx/include/memctx.h b/src/adapter/memctx/include/memctx.h index 588bfb2..3c94859 100644 --- a/src/adapter/memctx/include/memctx.h +++ b/src/adapter/memctx/include/memctx.h @@ -12,6 +12,9 @@ typedef struct { /* TRANSFERS to the adapter (closed in close()) — the */ /* caller dups first if it must keep its own copy. */ /* <0 => default: open(ram_path, O_RDONLY) / stub-memfd */ + uint32_t fail_boots; /* test-only: fail the first N stub bootstraps of each cycle (the */ + /* attempt count restarts on success/invalidate) before succeeding (drives the retry/backoff path deterministically */ + /* without timing dependence); 0 in production. stub path only.
*/ } vmsig_memctx_cfg; /* Max SRC bytes per atomic gva_write (bounds the worker POD slot; mc_req header + src diff --git a/src/adapter/memctx/memctx.c b/src/adapter/memctx/memctx.c index 250a267..efa154f 100644 --- a/src/adapter/memctx/memctx.c +++ b/src/adapter/memctx/memctx.c @@ -25,6 +25,7 @@ #include #include #include +#include /* one-shot backoff timer for cold-bootstrap retry */ #ifdef VMSIG_WITH_VMIE #include "win32.h" /* vmie_win32_open/host_bootstrap/proc_list/close */ @@ -54,6 +55,18 @@ static int memfd_create(const char* name, unsigned int flags) { #define MC_MAX_SEG 8 #define MC_WORKER_DEPTH 16 /* one off-loop thread: rare bootstrap + writes */ +/* Cold-bootstrap retry backoff (guest may still be booting when discovery attaches us; + * host_bootstrap then finds no System process). Mirror of the discovery backoff so the + * adapter stays decoupled from the discovery layer (Rule-of-three not reached): 50ms base, + * exponential with the shift capped at 6, ceiling 2s steady-state (the failure counter is bumped before arming, so the first retry waits 100ms). One-shot timerfd: armed + * on failure, disarmed on success — no it_interval, no busy-wait. */ +#define MC_BOOT_BACKOFF_BASE 50000000ull /* 50 ms */ +#define MC_BOOT_BACKOFF_CAP 2000000000ull /* 2 s */ + +/* Adapter readiness fds are demuxed by per-slot cookie: slot 0 is the worker completion + * eventfd, slot 1 is the one-shot backoff timerfd that re-kicks the bootstrap. */ +enum { MC_COOKIE_WORKER = 0, MC_COOKIE_RETRY = 1 }; + enum { MC_JOB_BOOTSTRAP = 0, MC_JOB_WRITE = 1 }; /* worker req/res (POD <= VMSIG_WORK_SLOT). One off-loop worker runs BOTH the cold @@ -63,7 +76,10 @@ enum { MC_JOB_BOOTSTRAP = 0, MC_JOB_WRITE = 1 }; * target cr3 (0 => System DTB; resolved on the worker against a->kcr3).
*/ typedef struct { uint32_t op; /* MC_JOB_* */ - uint32_t boot_count; /* MC_JOB_BOOTSTRAP */ + uint32_t boot_count; /* MC_JOB_BOOTSTRAP: drives the stub kcr3 per epoch */ + uint32_t attempt; /* MC_JOB_BOOTSTRAP: consecutive-failure index of THIS */ + /* kick (copy of a->boot_attempts); stub fails while */ + /* attempt < a->fail_boots. NOT the epoch counter. */ /* --- MC_JOB_WRITE --- */ uint64_t cr3; /* target AS root; 0 => a->kcr3 (kernel AS), resolved on worker */ uint64_t gva; @@ -89,7 +105,12 @@ struct vmsig_adapter { vmsig_emit emit; int registered; /* register_memctx already called */ vmsig_worker* worker; /* off-loop bootstrap + atomic writes */ - uint32_t boot_count; /* incremented on each (re-)bootstrap */ + uint32_t boot_count; /* incremented on each (re-)bootstrap (epoch tag) */ + + /* cold-bootstrap retry — loop-thread-only (attach/on_ready/invalidate/close). */ + int retry_fd; /* one-shot backoff timerfd (-1 when none) */ + uint32_t boot_attempts; /* consecutive bootstrap failures this cycle (0 = none); reset on success/epoch */ + uint32_t fail_boots; /* test-only: fail the first N stub bootstraps (cfg); set once in mc_open, then read-only (worker reads it) */ #ifdef VMSIG_WITH_VMIE vmie_win32* win; /* held RW handle across the epoch (kcr3 source + gva_write target) */ @@ -109,6 +130,35 @@ struct vmsig_adapter { /* fwd: MEMWRITE completion ACK (defined below mc_submit; used in mc_on_ready demux). */ static void mc_memwrite_ack(struct vmsig_adapter* a, int ok, uint32_t corr, uint32_t origin); +/* mirror of the discovery backoff; kept in this adapter to stay decoupled from the discovery + * layer (Rule-of-three not reached). Exponential with a shift capped at 6, clamped to CAP: attempts 0..5 yield 50/100/200/400/800/1600 ms, 6 and above yield the 2 s cap. */ +static uint64_t mc_boot_backoff(uint32_t attempts) { + uint64_t b = MC_BOOT_BACKOFF_BASE << (attempts < 6 ? attempts : 6); + return b > MC_BOOT_BACKOFF_CAP ? MC_BOOT_BACKOFF_CAP : b; +} + +/* Arm the one-shot backoff timer (it_value only — no it_interval). Loop-thread-only.
+ * Best-effort: a settime failure is logged, not fatal (matches discovery rearm). */ +static void mc_arm_retry(struct vmsig_adapter* a) { + if (a->retry_fd < 0) return; + uint64_t dt = mc_boot_backoff(a->boot_attempts); + struct itimerspec its; + memset(&its, 0, sizeof its); + its.it_value.tv_sec = (time_t)(dt / 1000000000ull); + its.it_value.tv_nsec = (long)(dt % 1000000000ull); + if (timerfd_settime(a->retry_fd, 0, &its, NULL) != 0) + fprintf(stderr, "vmsig memctx: endpoint %u retry timer arm failed\n", a->endpoint); +} + +/* Disarm the backoff timer (zero itimerspec). Loop-thread-only. Used on bootstrap success + * and at epoch change so a stale arm from a prior failure cannot fire over a fresh cycle. */ +static void mc_disarm_retry(struct vmsig_adapter* a) { + if (a->retry_fd < 0) return; + struct itimerspec its; + memset(&its, 0, sizeof its); + (void)timerfd_settime(a->retry_fd, 0, &its, NULL); +} + /* ---- stub RO-fd: memfd + deterministic contents + seal of future writes ---- */ static int mc_make_stub_fd(uint32_t size) { int fd = memfd_create("vmsig_memctx", MFD_CLOEXEC | MFD_ALLOW_SEALING); @@ -184,6 +234,9 @@ static int mc_job(void* user, const void* req, void* res) { /* MC_JOB_BOOTSTRAP */ if (a->stub) { + /* test-only: fail the first fail_boots attempts to exercise the retry path + * deterministically (a->fail_boots is set once in open, read-only here).
*/ + if (rq->attempt < a->fail_boots) return -1; rs->kcr3 = 0xC0DE0000ull + (uint64_t)rq->boot_count * 0x1000ull; /* changes per epoch */ return 0; } @@ -202,6 +255,7 @@ static void mc_kick_bootstrap(struct vmsig_adapter* a) { mc_req rq; memset(&rq, 0, sizeof rq); rq.op = MC_JOB_BOOTSTRAP; rq.boot_count = a->boot_count; + rq.attempt = a->boot_attempts; /* failure index of this kick (loop-thread snapshot; the worker compares it to fail_boots on the stub path) */ (void)vmsig_worker_submit(a->worker, &rq, sizeof rq); /* full => drop (rare) */ } @@ -232,6 +286,10 @@ static void mc_reg_invalidate(void* ctx, uint32_t epoch) { struct vmsig_adapter* a = ctx; (void)epoch; /* the core owns the epoch; the adapter must re-bootstrap */ a->have_ctx = 0; /* the previous context is invalid */ + /* new cycle: drop a stale arm from the previous cycle and restart the failure counter at + * zero so this bootstrap's backoff starts fresh (and the first-failure diagnostic re-arms). */ + a->boot_attempts = 0; + mc_disarm_retry(a); mc_kick_bootstrap(a); /* off-loop; on_ready re-emits MEMCTX (new epoch) */ } @@ -247,11 +305,13 @@ static vmsig_adapter* mc_open(const void* cfg, uint32_t endpoint) { a->cfg_ro_fd = (c && c->ro_fd >= 0) ? c->ro_fd : -1; if (!a->ram_path && a->cfg_ro_fd < 0) a->stub = 1; /* no path/fd => stub */ a->stub_fd = -1; + a->retry_fd = -1; + a->fail_boots = c ? c->fail_boots : 0; /* set once; read-only afterwards (worker reads) */ return a; } static int mc_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg, int cap) { - if (cap < 1) return -1; + if (cap < 2) return -1; /* worker eventfd + one-shot backoff timerfd */ a->emit = *emit; a->worker = vmsig_worker_new(mc_job, a, 1, MC_WORKER_DEPTH); @@ -262,11 +322,27 @@ static int mc_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg if (a->stub_fd < 0) { vmsig_worker_free(a->worker); a->worker = NULL; return -1; } } - /* worker completion-eventfd as the readiness source (cookie=0).
*/ + /* one-shot backoff timerfd: re-kicks the cold bootstrap when the guest is still booting. + * Created here (loop-thread-only fd); armed on failure, disarmed on success. Roll back the + * worker + stub_fd on failure, symmetric to mc_make_stub_fd above. */ + a->retry_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); + if (a->retry_fd < 0) { + if (a->stub_fd >= 0) { close(a->stub_fd); a->stub_fd = -1; } + vmsig_worker_free(a->worker); a->worker = NULL; + return -1; + } + + /* worker completion-eventfd as the readiness source (cookie=MC_COOKIE_WORKER). */ reg[0].fd = vmsig_worker_evfd(a->worker); reg[0].epoll_events = EPOLLIN; reg[0].shape = VMSIG_RDY_EVENTFD; - reg[0].cookie = 0; + reg[0].cookie = MC_COOKIE_WORKER; + + /* backoff timerfd as the second readiness source (cookie=MC_COOKIE_RETRY). */ + reg[1].fd = a->retry_fd; + reg[1].epoll_events = EPOLLIN; + reg[1].shape = VMSIG_RDY_TIMERFD; + reg[1].cookie = MC_COOKIE_RETRY; /* register the reg BEFORE the first bootstrap: the core slot gets the hooks. describe * is not called until the slot is valid (which only happens after the first MEMCTX). */ @@ -289,11 +365,23 @@ static int mc_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg a->emit.emit(a->emit.token, &up); mc_kick_bootstrap(a); /* first bootstrap off-loop; assemble the locator on completion */ - return 1; + return 2; /* worker eventfd + backoff timerfd */ } static int mc_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) { - (void)cookie; (void)events; + (void)events; /* epoll flags carry nothing we need; the cookie selects the source */ + + /* retry timerfd fired: the guest was still booting; drain and re-kick the bootstrap. + * Re-kick is a fresh MC_JOB_BOOTSTRAP into the SAME FIFO worker queue, so it serializes + * behind any in-flight write — nothing extra to synchronize.
*/ + if (cookie == MC_COOKIE_RETRY) { + uint64_t v; + while (read(a->retry_fd, &v, sizeof v) == (ssize_t)sizeof v) { /* drain to EAGAIN (retry_fd is TFD_NONBLOCK). NOTE(review): the kick below runs even when no expiration was read — confirm a stale readiness event cannot reach this branch after mc_disarm_retry */ } + mc_kick_bootstrap(a); + return 0; + } + + /* cookie == MC_COOKIE_WORKER: worker completion. */ vmsig_worker_ack(a->worker); mc_res rs; int rc; @@ -304,18 +392,29 @@ static int mc_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) { continue; } if (rc != 0) { - /* bootstrap failed: ERROR (source MEMCTX); do NOT publish an invalid kcr3. */ - vmsig_event er; - memset(&er, 0, sizeof er); - er.kind = VMSIG_EV_ERROR; er.source = VMSIG_SRC_MEMCTX; er.dir = VMSIG_DIR_UP; - er.prio = VMSIG_PRIO_URGENT; er.endpoint = a->endpoint; - a->emit.emit(a->emit.token, &er); + /* bootstrap failed: the guest is likely still booting (host_bootstrap found no + * System process). This is NOT a control-level error — do NOT emit VMSIG_EV_ERROR + * (it would spam URGENT during a normal multi-second guest boot). Instead schedule a + * backoff retry; the context simply stays unpublished until a kick succeeds. One + * diagnostic line on the FIRST failure of the cycle (symmetric to the discovery + * "never came up" note), not on every attempt. */ + if (a->boot_attempts == 0) + fprintf(stderr, "vmsig memctx: endpoint %u bootstrap not ready yet, retrying\n", + a->endpoint); + a->boot_attempts++; + mc_arm_retry(a); /* one-shot timer at mc_boot_backoff(boot_attempts); the counter was just incremented, so the first delay is 100 ms */ continue; } /* assemble the locator on the loop thread from rs.kcr3. a->kcr3 is the gva_write * TARGET and is owned SOLELY by the worker thread (set in mc_bootstrap_armed, read by * MC_JOB_WRITE — same thread, FIFO happens-before); the loop must NOT also write it, or * an in-flight write at line ~170 would race it. cur_pod.kcr3 is loop-only (delivery). */ + + /* bootstrap succeeded: cancel any pending retry and reset the failure counter BEFORE + * publishing, so a stale timer armed by a prior failure cannot fire over a live context.
*/ + a->boot_attempts = 0; + mc_disarm_retry(a); + memset(&a->cur_pod, 0, sizeof a->cur_pod); a->cur_pod.kcr3 = rs.kcr3; a->cur_pod.low = a->low ? a->low : MC_STUB_SIZE; @@ -398,6 +497,9 @@ static void mc_close(vmsig_adapter* a) { if (a->win) vmie_win32_close(a->win); /* AFTER worker join: no in-flight gva_write */ #endif if (a->stub_fd >= 0) close(a->stub_fd); + /* one-shot backoff timerfd: a loop-thread-only fd (per the retry field comment in struct vmsig_adapter), so its close is independent of the + * worker join — same contract as stub_fd. The core already epoll_ctl(DEL)'d the slot. */ + if (a->retry_fd >= 0) close(a->retry_fd); /* ro_fd ownership transferred to the adapter at open(): close it here so a re-grant * (detach + re-attach with a fresh infra ro_fd) does not leak the prior one. Infra * that must keep its own copy dups before handing it in — symmetric to the holder diff --git a/src/test/test_memctx.c b/src/test/test_memctx.c index 116f992..b0a6601 100644 --- a/src/test/test_memctx.c +++ b/src/test/test_memctx.c @@ -42,7 +42,7 @@ struct holder { holder* peer; /* multi-VM: stop when both are ready (or NULL) */ int is_driver; /* stops the loop on a condition */ uint32_t expect_ep; - int memctx, invalidated, ticks, bad_ep; + int memctx, invalidated, ticks, bad_ep, errors; uint64_t last_kcr3, kcr3_e0; uint32_t last_epoch, last_nseg; int ro_ok, rw_eacces, seg0_ok; @@ -66,6 +66,7 @@ static int h_on_ev(void* u, const vmsig_event* ev) { holder* h = u; if (ev->kind == VMSIG_EV_VM_LIFECYCLE) h->ticks++; else if (ev->kind == VMSIG_EV_MEMCTX_INVALIDATED) h->invalidated++; + else if (ev->kind == VMSIG_EV_ERROR) h->errors++; /* counted so test_retry can assert no boot-retry ERROR spam */ maybe_stop(h); return 0; } @@ -393,6 +394,40 @@ static void test_ro_fd_ownership(void) { if (fcntl(ro, F_GETFD) >= 0) close(ro); /* belt-and-braces if the assert failed */ } +/* ---- 7.
cold-bootstrap retry: stub fails N times, then publishes via backoff ----- * + * Regression for the cold-bootstrap-while-guest-boots bug: a failed bootstrap must NOT be + * terminal nor emit URGENT ERROR — it arms a one-shot backoff timerfd that re-kicks the + * bootstrap until it succeeds. fail_boots=3 makes the first three stub bootstraps fail + * deterministically (no timing dependence); the real timerfd fires at ~100/200/400ms (boot_attempts is bumped before arming), so the + * 4th kick succeeds sub-second. vmhost is added (as test_multicast) for the ticks failsafe + * and a realistic loop; stop on memctx>=1 (stop_epoch=-1). */ +static void test_retry(void) { + printf("test_retry\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + holder h; memset(&h, 0, sizeof h); + h.core = core; h.is_driver = 1; h.expect_ep = 0; h.stop_epoch = -1; + /* OBSERVE so vmhost lifecycle ticks reach maybe_stop (ticks>30 failsafe) and ERROR + * (if any) is counted; MEMCTX cap to receive the published context. */ + add_holder(core, &h, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0); + + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost (watchdog)"); + + vmsig_memctx_cfg mc; memset(&mc, 0, sizeof mc); + mc.stub = 1; mc.ram_path = NULL; mc.low = 0; mc.ro_fd = -1; mc.fail_boots = 3; + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), &mc, 0) >= 0, "add memctx (fail_boots=3)"); + + vmsig_core_run(core); + + CHECK(h.memctx >= 1, "MEMCTX published after a series of bootstrap failures (retry worked)"); + CHECK(h.last_kcr3 != 0, "valid kcr3 after the successful retry"); + CHECK(h.errors == 0, "no ERROR spam during boot retries"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + int main(void) { test_multicast(); test_epoch(); @@ -400,6 +435,7 @@ int main(void) { test_multivm(); test_socket(); test_ro_fd_ownership(); + test_retry(); printf("memctx tests: %s\n", g_fail ? "FAIL" : "PASS"); return g_fail ? 1 : 0; }