diff --git a/CMakeLists.txt b/CMakeLists.txt index fd84edb..fc31095 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.16) # Single source of truth for the version: CI passes -DVMSIG_VERSION=${TAG#v}, so the project # version (-> libvgpu-perception SONAME/.so version) and the .deb version come from one tag. -set(VMSIG_VERSION "0.3.11" CACHE STRING "Release version (MAJOR.MINOR.PATCH); CI passes the tag") +set(VMSIG_VERSION "0.3.12" CACHE STRING "Release version (MAJOR.MINOR.PATCH); CI passes the tag") project(vmsig VERSION ${VMSIG_VERSION} LANGUAGES C) set(CMAKE_C_STANDARD 17) diff --git a/src/adapter/memctx/include/memctx.h b/src/adapter/memctx/include/memctx.h index 3c94859..f020af0 100644 --- a/src/adapter/memctx/include/memctx.h +++ b/src/adapter/memctx/include/memctx.h @@ -15,6 +15,13 @@ typedef struct { uint32_t fail_boots; /* test-only: fail the first N stub bootstraps before */ /* succeeding (drives the retry/backoff path deterministically */ /* without timing dependence); 0 in production. stub path only. */ + const char* persist_path; /* armed: path to the kcr3 cache file (sibling of .slots in the */ + /* watch dir, tmpfs-local: survives a daemon restart, dies with the */ + /* RAM file on host reboot). NULL/empty => persist disabled (cold */ + /* bootstrap only). The boot-session discriminator is the kcr3 */ + /* itself: on resume it is validated against live RAM via */ + /* vmie_win32_open_ro_fd (NULL if it no longer resolves the kernel) */ + /* — a stale kcr3 after a guest reboot is rejected, fail-closed. 
*/ } vmsig_memctx_cfg; /* Max SRC bytes per atomic gva_write (bounds the worker POD slot; mc_req header + src diff --git a/src/adapter/memctx/memctx.c b/src/adapter/memctx/memctx.c index efa154f..8a138c7 100644 --- a/src/adapter/memctx/memctx.c +++ b/src/adapter/memctx/memctx.c @@ -26,6 +26,7 @@ #include #include #include /* one-shot backoff timer for cold-bootstrap retry */ +#include /* persist file mode bits (0600) */ #ifdef VMSIG_WITH_VMIE #include "win32.h" /* vmie_win32_open/host_bootstrap/proc_list/close */ @@ -67,7 +68,82 @@ static int memfd_create(const char* name, unsigned int flags) { * eventfd, slot 1 is the one-shot backoff timerfd that re-kicks the bootstrap. */ enum { MC_COOKIE_WORKER = 0, MC_COOKIE_RETRY = 1 }; -enum { MC_JOB_BOOTSTRAP = 0, MC_JOB_WRITE = 1 }; +/* MC_JOB_RESUME: fast-path boot-session re-validation. On a daemon restart the cold scan + * (host_bootstrap) is slow AND unstable (it hunts the agent beacon across physical RAM); if + * the guest did NOT reboot, its System DTB (kcr3) is unchanged and was cached at the last + * live scan. RESUME re-opens an O_RDONLY context with that cached kcr3 (vmie_win32_open_ro_fd, + * which bypasses the beacon scan) — the boot-session discriminator is the kcr3 ITSELF against + * the live RAM: it resolves the kernel (ntoskrnl) only if the guest is the same boot. */ +enum { MC_JOB_BOOTSTRAP = 0, MC_JOB_WRITE = 1, MC_JOB_RESUME = 2 }; + +/* ---- kcr3 context persist: a cache of the cold-bootstrap result, mirror of the .slots + * idiom in src/discovery/slot.c (magic+version POD, native byte order, atomic tmp+rename, + * fail-soft load). Deliberately NOT factored into a shared helper: discovery (vmid<->slot) + * and this adapter (kcr3 cache) are different layers with different lifecycles — Rule-of-three + * is not reached, and a shared helper would couple the two prematurely. + * + * We persist the MINIMUM: only {magic, version, kcr3}. 
/* ---- kcr3 cache: on-disk layout plus the save / load / drop helpers ----
 *
 * Only {magic, version, kcr3} is stored, in native byte order. Metadata of the RAM backing
 * (st_ino/size/mtime/btime) is deliberately NOT stored: it does not prove that the RAM still
 * holds the same boot session. The stored kcr3 is only a candidate; it is re-validated
 * against live RAM by the MC_JOB_RESUME worker job before anything is published.
 *
 * MEMWRITE-target safety: a persisted kcr3 is a READ locator only. The write target
 * (a->kcr3) is set only by the bootstrap worker after a fresh live scan, never from this file.
 *
 * A kcr3 of 0 is never cached and never loaded: 0 is the "no kcr3" sentinel elsewhere in this
 * file (mc_req.cr3 == 0 means "use a->kcr3"; the stub RESUME rejects 0). */
#define MC_PERSIST_MAGIC   0x4B435258u /* bytes 'K','C','R','X' */
#define MC_PERSIST_VERSION 1u

typedef struct {
    uint32_t magic;   /* MC_PERSIST_MAGIC */
    uint32_t version; /* MC_PERSIST_VERSION; any other value is ignored on load */
    uint64_t kcr3;    /* System DTB from a live RAM scan; nonzero */
} mc_persist_blob;

/* The struct is written and read as raw bytes, so its size IS the file format. */
_Static_assert(sizeof(mc_persist_blob) == 16, "mc_persist_blob on-disk layout changed");

/* Atomically replace the cache at `path` with a blob carrying `kcr3`.
 *
 * The blob is written to "<path>.tmp" and then rename()d over `path`, so a reader sees either
 * the whole old file or the whole new one. On any failure after the temp file was created the
 * temp file is removed and `path` is left untouched.
 *
 * Returns 0 on success; -1 if `path` is NULL/empty, `kcr3` is 0, the temp name does not fit,
 * or any of open/write/close/rename fails. Callers treat the save as best-effort. */
static int mc_persist_save(const char* path, uint64_t kcr3) {
    if (!path || !*path || kcr3 == 0) return -1;

    char tmp[512];
    int n = snprintf(tmp, sizeof tmp, "%s.tmp", path);
    if (n < 0 || (size_t)n >= sizeof tmp) return -1; /* a truncated name would hit another file */

    mc_persist_blob b;
    memset(&b, 0, sizeof b);
    b.magic = MC_PERSIST_MAGIC;
    b.version = MC_PERSIST_VERSION;
    b.kcr3 = kcr3;

    int fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0600);
    if (fd < 0) return -1;

    int rc = (write(fd, &b, sizeof b) == (ssize_t)sizeof b) ? 0 : -1;
    if (close(fd) != 0) rc = -1;
    if (rc == 0 && rename(tmp, path) != 0) rc = -1;
    if (rc != 0) (void)unlink(tmp); /* never leave a half-written temp behind */
    return rc;
}

/* Read the cache at `path` and check its SHAPE only (size, magic, version, nonzero kcr3).
 *
 * Returns 1 and fills `*out` when a well-formed blob was read. Returns 0 in every other case
 * (NULL/empty path, NULL out, missing file, short read, wrong magic/version, kcr3 == 0) and
 * leaves `*out` untouched, so the caller falls back to a cold bootstrap. There are no
 * migrations: a blob with another version is simply ignored. */
static int mc_persist_load(const char* path, mc_persist_blob* out) {
    if (!path || !*path || !out) return 0;

    int fd = open(path, O_RDONLY | O_CLOEXEC);
    if (fd < 0) return 0; /* no cache => cold bootstrap */

    mc_persist_blob b;
    ssize_t r = read(fd, &b, sizeof b);
    (void)close(fd);

    if (r != (ssize_t)sizeof b) return 0; /* short or unreadable */
    if (b.magic != MC_PERSIST_MAGIC || b.version != MC_PERSIST_VERSION) return 0;
    if (b.kcr3 == 0) return 0; /* sentinel value: nothing to resume from */

    *out = b;
    return 1;
}

/* Remove the cache file at `path`; a NULL/empty path is a no-op. The unlink result is
 * ignored (best-effort hygiene: a leftover file is still subject to load-time checks). */
static void mc_persist_drop(const char* path) {
    if (path && *path) (void)unlink(path);
}
*/ - /* --- MC_JOB_WRITE --- */ - uint64_t cr3; /* target AS root; 0 => a->kcr3 (kernel AS), resolved on worker */ + /* --- MC_JOB_WRITE / MC_JOB_RESUME --- */ + uint64_t cr3; /* WRITE: target AS root (0 => a->kcr3); RESUME: persisted kcr3 to validate */ + uint64_t low; /* MC_JOB_RESUME: below-4G split for vmie_win32_open_ro_fd (ignored by others) */ uint64_t gva; uint32_t len; uint32_t corr; @@ -100,6 +177,7 @@ struct vmsig_adapter { uint32_t endpoint; int stub; const char* ram_path; /* armed: RAM-backing path (NOT published outward) */ + const char* persist_path; /* armed: kcr3 cache file path (cfg, loop-thread-only); NULL => persist off */ uint64_t low; int cfg_ro_fd; /* >=0 => infra-sealed RO-fd (owned by adapter, closed in mc_close); <0 => default */ vmsig_emit emit; @@ -232,6 +310,49 @@ static int mc_job(void* user, const void* req, void* res) { #endif } + if (rq->op == MC_JOB_RESUME) { + /* Fast-path boot-session re-validation: open an O_RDONLY context with the PERSISTED + * kcr3 and let the engine decide if it still resolves the kernel in the LIVE RAM. + * This is purely a READ validation — it NEVER touches a->win/a->mem/a->kcr3 (the + * RW write-hold, owned by the bootstrap worker after a fresh live scan). MEMWRITE- + * target safety: a persisted kcr3 must never become the gva_write target. */ + if (a->stub) { + /* No VMIE here, so there is no real RAM to validate against: synthetically ACCEPT a + * nonzero kcr3 so the stub can exercise the persist MECHANICS (save/load/fast-vs-slow + * selection). This is NOT real boot-session validation — that is armed-only. */ + if (rq->cr3 == 0) return -1; + rs->kcr3 = rq->cr3; + return 0; + } +#ifdef VMSIG_WITH_VMIE + /* fresh O_RDONLY fd over the backing (same source as mc_reg_share_fd: dup the infra + * RO-fd, else open ram_path O_RDONLY). The RO context borrows it (dup'd internally), + * so we close our copy after open. 
*/ + int rfd; + if (a->cfg_ro_fd >= 0) rfd = fcntl(a->cfg_ro_fd, F_DUPFD_CLOEXEC, 0); + else if (a->ram_path) rfd = open(a->ram_path, O_RDONLY | O_CLOEXEC); + else return -1; + if (rfd < 0) return -1; + vmie_win32* v = vmie_win32_open_ro_fd(rfd, rq->low, rq->cr3); + close(rfd); /* borrowed by open_ro_fd (dup'd internally) */ + if (!v) return -1; /* kcr3 no longer resolves the kernel => stale/guest-reboot */ + /* Second, independent signal: the System process must be present AND its cr3 must equal + * the persisted kcr3 (the System DTB by definition). Catches the pathology "kcr3 resolves + * a DIFFERENT kernel". Cheap — the RO context is already built. Fail-closed on mismatch. */ + process procs[16]; + int n = proc_list(v, 0, procs, 16); + int system_ok = 0; + for (int i = 0; i < n && i < 16; i++) + if (!strcmp(procs[i].name, "System")) { system_ok = (procs[i].cr3 == rq->cr3); break; } + vmie_win32_close(v); /* validation-only: the read datum needs no held handle */ + if (!system_ok) return -1; + rs->kcr3 = rq->cr3; /* validated: publish the read datum (NOT a->kcr3) */ + return 0; +#else + return -1; /* armed without the build flag: resume impossible -> cold bootstrap */ +#endif + } + /* MC_JOB_BOOTSTRAP */ if (a->stub) { /* test-only: fail the first fail_boots attempts to exercise the retry path @@ -259,6 +380,45 @@ static void mc_kick_bootstrap(struct vmsig_adapter* a) { (void)vmsig_worker_submit(a->worker, &rq, sizeof rq); /* full => drop (rare) */ } +/* Submit the fast-path RESUME (off-loop: open_ro_fd reads image pages, not on the loop thread). + * Carries the persisted kcr3 + the cfg low for vmie_win32_open_ro_fd. On miss/validation-fail the + * completion handler falls back to a cold bootstrap — the persist never replaces it. 
*/ +static void mc_kick_resume(struct vmsig_adapter* a, uint64_t kcr3) { + mc_req rq; + memset(&rq, 0, sizeof rq); + rq.op = MC_JOB_RESUME; rq.cr3 = kcr3; rq.low = a->low; + (void)vmsig_worker_submit(a->worker, &rq, sizeof rq); /* full => drop (rare) */ +} + +/* Single publication path for BOTH RESUME and BOOTSTRAP (no two ways to publish a MEMCTX). + * Assembles the single-low locator from `kcr3` + a->low, marks have_ctx, and emits the MEMCTX + * trigger; the core authoritatively re-describes and stamps the epoch. Loop-thread-only. + * + * Ownership: this writes kcr3 ONLY into cur_pod.kcr3 (the delivery copy). It does NOT touch + * a->kcr3 — that is the gva_write TARGET, owned solely by the bootstrap worker. The difference + * between the two callers is only the SOURCE of kcr3 and whether an RW-hold / persist-save + * follows; the locator assembly itself is shared here. */ +static void mc_publish_ctx(struct vmsig_adapter* a, uint64_t kcr3) { + memset(&a->cur_pod, 0, sizeof a->cur_pod); + a->cur_pod.kcr3 = kcr3; + a->cur_pod.low = a->low ? a->low : MC_STUB_SIZE; + a->cur_pod.flags = VMSIG_MEMCTX_RDONLY; + a->cur_nseg = 1; /* single-low identity (gpa 0 .. low) */ + a->cur_segs[0].gpa = 0; + a->cur_segs[0].len = a->cur_pod.low; + a->cur_segs[0].file_off = 0; + a->cur_pod.nseg = a->cur_nseg; + a->have_ctx = 1; + + /* emit the MEMCTX trigger: the core authoritatively re-describes + stamps the epoch. 
*/ + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_MEMCTX; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; + memcpy(up.inln, &a->cur_pod, sizeof a->cur_pod); + a->emit.emit(a->emit.token, &up); +} + /* ---- reg hooks (vmsig_memctx_reg.ctx = a; called by the core on the loop thread) ---- */ static void mc_reg_describe(void* ctx, vmsig_memctx* out_pod, const vmsig_memseg** out_segs, uint32_t* out_nseg) { @@ -286,6 +446,10 @@ static void mc_reg_invalidate(void* ctx, uint32_t epoch) { struct vmsig_adapter* a = ctx; (void)epoch; /* the core owns the epoch; the adapter must re-bootstrap */ a->have_ctx = 0; /* the previous context is invalid */ + /* destructive VM-lifecycle => the RAM may have changed => drop the kcr3 cache so the next + * restart cannot fast-path off a now-dead kcr3 (the self-validation would reject it anyway, + * but we do not leave a known-stale file). Best-effort, loop-thread-only. */ + mc_persist_drop(a->persist_path); /* new cycle: drop a stale arm from the previous cycle and restart the failure counter at * zero so this bootstrap's backoff starts fresh (and the first-failure diagnostic re-arms). */ a->boot_attempts = 0; @@ -307,6 +471,7 @@ static vmsig_adapter* mc_open(const void* cfg, uint32_t endpoint) { a->stub_fd = -1; a->retry_fd = -1; a->fail_boots = c ? c->fail_boots : 0; /* set once; read-only afterwards (worker reads) */ + a->persist_path = c ? c->persist_path : NULL; /* NULL => persist disabled (cold bootstrap only) */ return a; } @@ -364,7 +529,16 @@ static int mc_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; a->emit.emit(a->emit.token, &up); - mc_kick_bootstrap(a); /* first bootstrap off-loop; assemble the locator on completion */ + /* Fast-path: if a kcr3 cache exists, try a RESUME (re-validate it against live RAM) BEFORE + * the cold scan. 
On a daemon restart over an unchanged guest this publishes the read datum + * in milliseconds instead of minutes of beacon-scan retry. On any miss (persist off / stub / + * no file / corrupt) we fall straight into the existing cold bootstrap. The RW-hold for + * MEMWRITE is still acquired by a cold bootstrap (kicked in parallel after a RESUME hit). */ + mc_persist_blob b; + if (a->persist_path && *a->persist_path && mc_persist_load(a->persist_path, &b)) + mc_kick_resume(a, b.kcr3); /* validate the cached kcr3 off-loop; cold fallback on miss */ + else + mc_kick_bootstrap(a); /* first cold bootstrap off-loop; assemble locator on completion */ return 2; /* worker eventfd + backoff timerfd */ } @@ -391,6 +565,27 @@ static int mc_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) { mc_memwrite_ack(a, rs.ok && rc == 0, rs.corr, rs.origin); continue; } + + if (rs.op == MC_JOB_RESUME) { + /* Fast-path completion. The persisted kcr3 was validated against the LIVE RAM on the + * worker (open_ro_fd != NULL [+ System-cr3 match]) — the read datum is safe to publish. + * Note: the worker did NOT set a->kcr3/a->win/a->mem (the RW write-hold), so MEMWRITE + * stays ok=0 until a cold bootstrap acquires it. */ + if (rc == 0) { + mc_publish_ctx(a, rs.kcr3); /* video lives instantly (read datum), epoch by core */ + mc_kick_bootstrap(a); /* in parallel: acquire the RW-hold (a->kcr3) for MEMWRITE */ + /* Do NOT save the persist (the kcr3 came FROM the file) and do NOT arm a retry + * (the read datum is up; the parallel bootstrap arms its own retry on failure). */ + } else { + /* validation miss: the persisted kcr3 no longer resolves the kernel (guest rebooted + * or corrupt). Fall back to an honest cold scan; on success it rewrites the persist + * with a fresh kcr3. Do NOT retry the RESUME — the cache is under suspicion. 
*/ + mc_kick_bootstrap(a); + } + continue; + } + + /* MC_JOB_BOOTSTRAP */ if (rc != 0) { /* bootstrap failed: the guest is likely still booting (host_bootstrap found no * System process). This is NOT a control-level error — do NOT emit VMSIG_EV_ERROR @@ -405,34 +600,29 @@ static int mc_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) { mc_arm_retry(a); /* one-shot timer at mc_boot_backoff(boot_attempts) */ continue; } - /* assemble the locator on the loop thread from rs.kcr3. a->kcr3 is the gva_write - * TARGET and is owned SOLELY by the worker thread (set in mc_bootstrap_armed, read by - * MC_JOB_WRITE — same thread, FIFO happens-before); the loop must NOT also write it, or - * an in-flight write at line ~170 would race it. cur_pod.kcr3 is loop-only (delivery). */ - - /* bootstrap succeeded: cancel any pending retry and reset the failure counter BEFORE - * publishing, so a stale timer armed by a prior failure cannot fire over a live context. */ + /* bootstrap succeeded: a->kcr3/a->mem (the gva_write TARGET / RW-hold) were set on the + * worker (mc_bootstrap_armed); the loop must NOT also write a->kcr3 (it would race an + * in-flight write — same FIFO thread owns it). MEMWRITE is now possible. cur_pod.kcr3 is + * loop-only (delivery) and is set inside mc_publish_ctx. + * + * Cancel any pending retry and reset the failure counter BEFORE publishing, so a stale + * timer armed by a prior failure cannot fire over a live context. */ a->boot_attempts = 0; mc_disarm_retry(a); - memset(&a->cur_pod, 0, sizeof a->cur_pod); - a->cur_pod.kcr3 = rs.kcr3; - a->cur_pod.low = a->low ? a->low : MC_STUB_SIZE; - a->cur_pod.flags = VMSIG_MEMCTX_RDONLY; - a->cur_nseg = 1; /* single-low identity (gpa 0 .. 
low) */ - a->cur_segs[0].gpa = 0; - a->cur_segs[0].len = a->cur_pod.low; - a->cur_segs[0].file_off = 0; - a->cur_pod.nseg = a->cur_nseg; - a->have_ctx = 1; + /* Publish only if a RESUME has not already published this same context (same kcr3): a + * parallel cold bootstrap after a RESUME hit must acquire the RW-hold WITHOUT emitting a + * redundant MEMCTX. First-time publication otherwise. */ + if (!a->have_ctx) + mc_publish_ctx(a, rs.kcr3); - /* emit the MEMCTX trigger: the core authoritatively re-describes + stamps the epoch. */ - vmsig_event up; - memset(&up, 0, sizeof up); - up.kind = VMSIG_EV_MEMCTX; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP; - up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; - memcpy(up.inln, &a->cur_pod, sizeof a->cur_pod); - a->emit.emit(a->emit.token, &up); + /* Cache the freshly-scanned kcr3 for the next daemon restart (best-effort; the datum is + * already published). Only the cold scan writes the persist — never the RESUME path (its + * kcr3 came from the file). Gated on persist_path presence: production stub paths get a + * NULL persist_path from discovery, so they never write; a test may supply one to exercise + * the persist mechanics (the stub bootstrap yields a synthetic-but-stable kcr3). */ + if (a->persist_path && *a->persist_path) + (void)mc_persist_save(a->persist_path, rs.kcr3); } return 0; } diff --git a/src/discovery/discovery.c b/src/discovery/discovery.c index 25b4328..c7d9757 100644 --- a/src/discovery/discovery.c +++ b/src/discovery/discovery.c @@ -60,6 +60,10 @@ struct vmsig_discovery { * writes these at attach; the vmhost seam borrows them to add input-linux objects. Same * lifetime discipline as ep_facts (outlives the deferred adapter reap). */ struct { char evdev_a[64]; char evdev_b[64]; } ep_bridge[VMSIG_SLOT_COUNT]; + /* Stable per-endpoint home for the memctx kcr3-cache path (sibling of .slots in the watch + * dir). 
The memctx adapter keeps the pointer across its lifetime; same lifetime discipline + * as ep_facts/ep_bridge (outlives the deferred adapter reap, overwritten on next attach). */ + char ep_persist[VMSIG_SLOT_COUNT][DISC_PATH_MAX + 32]; }; static uint64_t now_ns(void) { @@ -269,14 +273,25 @@ static void bootstrap_scan(vmsig_discovery* d) { static int default_attach(void* ud, vmsig_core* core, uint32_t vmid, uint32_t endpoint, const vmsig_host_facts* f) { - (void)vmid; vmsig_discovery* d = ud; /* default sink carries the discovery handle (ep_bridge home) */ char* ev_a = d ? d->ep_bridge[endpoint].evdev_a : NULL; char* ev_b = d ? d->ep_bridge[endpoint].evdev_b : NULL; if (d) { ev_a[0] = '\0'; ev_b[0] = '\0'; } /* clear stale paths from a prior attach */ + /* Form the kcr3-cache path (per-vmid, sibling of .slots/the RAM file in the watch dir). + * Gated on d->persist — one policy for all ephemeral watch-dir state. NULL => persist off. */ + const char* persist_path = NULL; + if (d && d->persist) { + int pn = snprintf(d->ep_persist[endpoint], sizeof d->ep_persist[endpoint], + "%s/.kcr3-vm-%u", d->watch_dir, vmid); + /* only enable the cache if the path fit (a truncated path would point elsewhere). */ + if (pn > 0 && (size_t)pn < sizeof d->ep_persist[endpoint]) + persist_path = d->ep_persist[endpoint]; + } + vmsig_memctx_cfg mc; memset(&mc, 0, sizeof mc); mc.stub = 0; mc.ram_path = f->ram_path; mc.low = f->low; mc.ro_fd = -1; + mc.persist_path = persist_path; vmsig_input_cfg in; memset(&in, 0, sizeof in); /* input is uinput; power/lifecycle via the vmhost seam. The adapter publishes its uinput * evdev paths into ep_bridge so the vmhost seam can forward them via input-linux. */ diff --git a/src/test/test_memctx.c b/src/test/test_memctx.c index b0a6601..fb40da6 100644 --- a/src/test/test_memctx.c +++ b/src/test/test_memctx.c @@ -428,6 +428,130 @@ static void test_retry(void) { vmsig_ctx_free(ctx); } +/* ---- 8-11. 
kcr3-persist MECHANICS (stub) ---------------------------------- * + * These exercise the persist MACHINERY only: save/load, corruption fail-soft, drop-on- + * invalidate, and the fast-vs-slow path selection. They do NOT exercise the real boot-session + * validation (vmie_win32_open_ro_fd rejecting a stale kcr3) — that is VMIE-dependent and is + * covered only on the armed stand. Under the stub, MC_JOB_RESUME synthetically ACCEPTS any + * nonzero kcr3 (there is no live RAM to validate against), so a successful RESUME here proves + * the mechanism wired the cached kcr3 into a publication, NOT that the kcr3 was validated. */ + +static int file_exists(const char* path) { return access(path, F_OK) == 0; } + +/* Run a memctx endpoint to its first MEMCTX (or the ticks failsafe) over a private core. */ +static void run_once(uint64_t* out_kcr3, int* out_memctx, const char* persist_path, + uint32_t fail_boots) { + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + holder h; memset(&h, 0, sizeof h); + h.core = core; h.is_driver = 1; h.expect_ep = 0; h.stop_epoch = -1; + add_holder(core, &h, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0); + + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost (watchdog)"); + + vmsig_memctx_cfg mc; memset(&mc, 0, sizeof mc); + mc.stub = 1; mc.ram_path = NULL; mc.low = 0; mc.ro_fd = -1; + mc.fail_boots = fail_boots; mc.persist_path = persist_path; + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), &mc, 0) >= 0, "add memctx"); + + vmsig_core_run(core); + if (out_kcr3) *out_kcr3 = h.last_kcr3; + if (out_memctx) *out_memctx = h.memctx; + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* 8. save-then-resume: run1 (cold stub bootstrap) publishes MEMCTX and WRITES the cache; run2 + * over the SAME persist_path takes the RESUME fast-path. 
The KEY is fail_boots=large in run2: + * if it had gone through a cold bootstrap it would have failed N times (no MEMCTX inside the + * loop budget); a prompt MEMCTX carrying the SAVED kcr3 proves RESUME bypassed the bootstrap. */ +static void test_persist_save_then_resume(void) { + printf("test_persist_save_then_resume\n"); + char path[256]; + snprintf(path, sizeof path, "/tmp/vmsig-kcrx-%d.bin", (int)getpid()); + unlink(path); + + uint64_t k1 = 0; int m1 = 0; + run_once(&k1, &m1, path, 0); + CHECK(m1 >= 1, "run1 published MEMCTX"); + CHECK(k1 != 0, "run1 kcr3 nonzero"); + CHECK(file_exists(path), "run1 wrote the kcr3 cache file"); + + /* run2: a cold bootstrap would fail 1000 times — only RESUME can publish promptly. */ + uint64_t k2 = 0; int m2 = 0; + run_once(&k2, &m2, path, 1000); + CHECK(m2 >= 1, "run2 published MEMCTX via the RESUME fast-path (bootstrap would have failed)"); + CHECK(k2 == k1, "run2 published the SAVED kcr3 (resumed from cache, not a fresh scan)"); + + unlink(path); +} + +/* 9. corrupt file => load fail-soft => cold bootstrap still brings the context up. */ +static void test_persist_corrupt(void) { + printf("test_persist_corrupt\n"); + char path[256]; + snprintf(path, sizeof path, "/tmp/vmsig-kcrx-corrupt-%d.bin", (int)getpid()); + int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600); + CHECK(fd >= 0, "created a corrupt cache file"); + if (fd >= 0) { (void)!write(fd, "x", 1); close(fd); } /* 1 byte: short/wrong magic */ + + uint64_t k = 0; int m = 0; + run_once(&k, &m, path, 0); /* load miss => cold bootstrap (fail_boots=0 => succeeds) */ + CHECK(m >= 1, "MEMCTX still published after a corrupt cache (fail-soft load)"); + CHECK(k != 0, "kcr3 nonzero from the cold bootstrap"); + + unlink(path); +} + +/* 10. invalidate drops the cache; the re-bootstrap on the new epoch rewrites it fresh. 
*/ +static void test_persist_invalidate_drop(void) { + printf("test_persist_invalidate_drop\n"); + char path[256]; + snprintf(path, sizeof path, "/tmp/vmsig-kcrx-inv-%d.bin", (int)getpid()); + unlink(path); + + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + holder h; memset(&h, 0, sizeof h); + /* inject a destructive lifecycle on epoch0 (as test_epoch); stop after epoch1. */ + h.core = core; h.is_driver = 1; h.expect_ep = 0; h.inject_reset = 1; h.stop_epoch = 1; + add_holder(core, &h, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0); + + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost (watchdog)"); + vmsig_memctx_cfg mc; memset(&mc, 0, sizeof mc); + mc.stub = 1; mc.ram_path = NULL; mc.low = 0; mc.ro_fd = -1; mc.persist_path = path; + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), &mc, 0) >= 0, "add memctx"); + + vmsig_core_run(core); + + /* epoch0 bootstrap wrote the cache; invalidate dropped it; epoch1 bootstrap rewrote it. */ + CHECK(h.invalidated >= 1, "invalidation fired"); + CHECK(h.last_epoch == 1, "re-published at epoch 1 after invalidate"); + CHECK(file_exists(path), "cache rewritten by the post-invalidate bootstrap"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); + unlink(path); +} + +/* 11. persist disabled (persist_path=NULL): no cache file is ever created (today's behavior). 
*/ +static void test_persist_stub_disabled(void) { + printf("test_persist_stub_disabled\n"); + char path[256]; + snprintf(path, sizeof path, "/tmp/vmsig-kcrx-off-%d.bin", (int)getpid()); + unlink(path); + + uint64_t k = 0; int m = 0; + run_once(&k, &m, NULL, 0); /* persist off */ + CHECK(m >= 1, "MEMCTX published with persist disabled"); + CHECK(!file_exists(path), "no cache file created when persist is disabled"); + + unlink(path); /* belt-and-braces */ +} + int main(void) { test_multicast(); test_epoch(); @@ -436,6 +560,10 @@ int main(void) { test_socket(); test_ro_fd_ownership(); test_retry(); + test_persist_save_then_resume(); + test_persist_corrupt(); + test_persist_invalidate_drop(); + test_persist_stub_disabled(); printf("memctx tests: %s\n", g_fail ? "FAIL" : "PASS"); return g_fail ? 1 : 0; }