feat(memctx): persist kcr3 to fast-restart without a cold rescan

The cold host_bootstrap hunts the agent beacon across physical RAM and is
slow and unstable. After a daemon restart the adapter re-scans from scratch,
which takes minutes during which there is no address-space context to vend,
even though the guest has long been booted and its System DTB (kcr3) is unchanged.

Cache the kcr3 from a successful live scan in a watch-dir sibling of the slot
map (tmpfs: survives a restart, dies with the RAM file on host reboot). On
attach, re-validate the cached kcr3 against the live RAM via an O_RDONLY
context (open_ro_fd, which bypasses the beacon scan) plus a System-cr3 match,
and publish the read datum immediately when it still resolves the kernel. A
guest reboot changes the System DTB, so a stale kcr3 no longer resolves and
falls back to a cold scan: the boot-session discriminator is the kcr3 itself,
not file metadata.

The gva_write target is never taken from the cache: it is set only by a fresh
live scan, so a persisted kcr3 is a read locator only and MEMWRITE stays
fail-closed until a cold bootstrap acquires the write hold.

Persist is off unless the path is supplied (NULL keeps current behaviour).

Bump 0.3.12.
This commit is contained in:
2026-06-24 22:01:27 +03:00
parent 7ab6119b1f
commit bcf5d4f824
5 changed files with 370 additions and 30 deletions
+1 -1
View File
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.16) cmake_minimum_required(VERSION 3.16)
# Single source of truth for the version: CI passes -DVMSIG_VERSION=${TAG#v}, so the project # Single source of truth for the version: CI passes -DVMSIG_VERSION=${TAG#v}, so the project
# version (-> libvgpu-perception SONAME/.so version) and the .deb version come from one tag. # version (-> libvgpu-perception SONAME/.so version) and the .deb version come from one tag.
set(VMSIG_VERSION "0.3.11" CACHE STRING "Release version (MAJOR.MINOR.PATCH); CI passes the tag") set(VMSIG_VERSION "0.3.12" CACHE STRING "Release version (MAJOR.MINOR.PATCH); CI passes the tag")
project(vmsig VERSION ${VMSIG_VERSION} LANGUAGES C) project(vmsig VERSION ${VMSIG_VERSION} LANGUAGES C)
set(CMAKE_C_STANDARD 17) set(CMAKE_C_STANDARD 17)
+7
View File
@@ -15,6 +15,13 @@ typedef struct {
uint32_t fail_boots; /* test-only: fail the first N stub bootstraps before */ uint32_t fail_boots; /* test-only: fail the first N stub bootstraps before */
/* succeeding (drives the retry/backoff path deterministically */ /* succeeding (drives the retry/backoff path deterministically */
/* without timing dependence); 0 in production. stub path only. */ /* without timing dependence); 0 in production. stub path only. */
const char* persist_path; /* armed: path to the kcr3 cache file (sibling of .slots in the */
/* watch dir, tmpfs-local: survives a daemon restart, dies with the */
/* RAM file on host reboot). NULL/empty => persist disabled (cold */
/* bootstrap only). The boot-session discriminator is the kcr3 */
/* itself: on resume it is validated against live RAM via */
/* vmie_win32_open_ro_fd (NULL if it no longer resolves the kernel) */
/* — a stale kcr3 after a guest reboot is rejected, fail-closed. */
} vmsig_memctx_cfg; } vmsig_memctx_cfg;
/* Max SRC bytes per atomic gva_write (bounds the worker POD slot; mc_req header + src /* Max SRC bytes per atomic gva_write (bounds the worker POD slot; mc_req header + src
+218 -28
View File
@@ -26,6 +26,7 @@
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/epoll.h> #include <sys/epoll.h>
#include <sys/timerfd.h> /* one-shot backoff timer for cold-bootstrap retry */ #include <sys/timerfd.h> /* one-shot backoff timer for cold-bootstrap retry */
#include <sys/stat.h> /* persist file mode bits (0600) */
#ifdef VMSIG_WITH_VMIE #ifdef VMSIG_WITH_VMIE
#include "win32.h" /* vmie_win32_open/host_bootstrap/proc_list/close */ #include "win32.h" /* vmie_win32_open/host_bootstrap/proc_list/close */
@@ -67,7 +68,82 @@ static int memfd_create(const char* name, unsigned int flags) {
* eventfd, slot 1 is the one-shot backoff timerfd that re-kicks the bootstrap. */ * eventfd, slot 1 is the one-shot backoff timerfd that re-kicks the bootstrap. */
enum { MC_COOKIE_WORKER = 0, MC_COOKIE_RETRY = 1 }; enum { MC_COOKIE_WORKER = 0, MC_COOKIE_RETRY = 1 };
enum { MC_JOB_BOOTSTRAP = 0, MC_JOB_WRITE = 1 }; /* MC_JOB_RESUME: fast-path boot-session re-validation. On a daemon restart the cold scan
* (host_bootstrap) is slow AND unstable (it hunts the agent beacon across physical RAM); if
* the guest did NOT reboot, its System DTB (kcr3) is unchanged and was cached at the last
* live scan. RESUME re-opens an O_RDONLY context with that cached kcr3 (vmie_win32_open_ro_fd,
* which bypasses the beacon scan) — the boot-session discriminator is the kcr3 ITSELF against
* the live RAM: it resolves the kernel (ntoskrnl) only if the guest is the same boot. */
enum { MC_JOB_BOOTSTRAP = 0, MC_JOB_WRITE = 1, MC_JOB_RESUME = 2 };
/* ---- kcr3 context persist: a cache of the cold-bootstrap result, mirror of the .slots
* idiom in src/discovery/slot.c (magic+version POD, native byte order, atomic tmp+rename,
* fail-soft load). Deliberately NOT factored into a shared helper: discovery (vmid<->slot)
* and this adapter (kcr3 cache) are different layers with different lifecycles — Rule-of-three
* is not reached, and a shared helper would couple the two prematurely.
*
* We persist the MINIMUM: only {magic, version, kcr3}. NO RAM metadata (st_ino/size/mtime/
* btime): those do NOT prove the RAM holds the same boot session (the backing file outlives a
* memory overwrite, the inode can be reused). The boot-session discriminator is the kcr3
* self-validating against the live RAM at load time (see MC_JOB_RESUME), not file metadata.
*
* MEMWRITE-target safety: a persisted kcr3 is a READ locator only. The write target (a->kcr3)
* is set ONLY by the bootstrap worker after a fresh live scan — never from this file. */
/* File signature of the kcr3 cache: the ASCII bytes 'K','C','R','X' packed into one
 * 32-bit value (0x4B 0x43 0x52 0x58). mc_persist_load rejects a blob with any other value. */
#define MC_PERSIST_MAGIC 0x4B435258u /* "KCRX" — kcr3 context cache */
/* Layout version of the blob. mc_persist_load rejects any other version (no migrations). */
#define MC_PERSIST_VERSION 1u
/* On-disk record of the kcr3 cache. mc_persist_save writes it and mc_persist_load reads it
 * as one raw struct image, so the file content is exactly these fields in memory layout. */
typedef struct {
uint32_t magic; /* expected to equal MC_PERSIST_MAGIC */
uint32_t version; /* expected to equal MC_PERSIST_VERSION */
uint64_t kcr3; /* System DTB obtained from a live RAM scan; validated by open_ro_fd */
} mc_persist_blob;
/* Store `kcr3` in the cache file at `path`.
 *
 * The blob is first written to "<path>.tmp" and then rename()d over the target, so the
 * target is replaced in one step and a reader does not observe a partially written cache.
 * The temp file is removed again if any step after its creation fails.
 *
 * path : cache file location; NULL or "" means persist is disabled.
 * kcr3 : value to store in the blob.
 *
 * Returns 0 when the blob was written and renamed into place, -1 otherwise (disabled,
 * temp path does not fit the local buffer, open/write/close/rename failure). Callers
 * treat this as best-effort. Intended for the loop thread only. */
static int mc_persist_save(const char* path, uint64_t kcr3) {
    if (path == NULL || path[0] == '\0')
        return -1;

    /* Build the temp sibling name; refuse a truncated name rather than write elsewhere. */
    char tmp_path[512];
    int len = snprintf(tmp_path, sizeof tmp_path, "%s.tmp", path);
    if (len < 0 || (size_t)len >= sizeof tmp_path)
        return -1;

    /* Zero the whole struct first so no indeterminate bytes reach the file. */
    mc_persist_blob blob;
    memset(&blob, 0, sizeof blob);
    blob.magic = MC_PERSIST_MAGIC;
    blob.version = MC_PERSIST_VERSION;
    blob.kcr3 = kcr3;

    int fd = open(tmp_path, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0600);
    if (fd < 0)
        return -1;

    /* A short write counts as a failure; the blob is only useful whole. */
    int ok = (write(fd, &blob, sizeof blob) == (ssize_t)sizeof blob);
    if (close(fd) != 0)
        ok = 0;
    if (ok && rename(tmp_path, path) != 0)
        ok = 0;

    if (!ok) {
        unlink(tmp_path); /* do not leave a stray temp file behind */
        return -1;
    }
    return 0;
}
/* Read the cache file at `path` and check its header.
 *
 * Only the SHAPE of the file is checked here (full-size read, expected magic, expected
 * version). Whether the stored kcr3 is still meaningful is decided elsewhere, by the
 * MC_JOB_RESUME worker job.
 *
 * path : cache file location; NULL or "" means persist is disabled.
 * out  : receives a copy of the blob; written only when 1 is returned.
 *
 * Returns 1 when a well-formed blob was read, 0 in every other case (disabled, file
 * cannot be opened, short read or read error, wrong magic, wrong version). A 0 result is
 * fail-soft: the caller falls back to a cold bootstrap. Intended for the loop thread only. */
static int mc_persist_load(const char* path, mc_persist_blob* out) {
    if (path == NULL || path[0] == '\0')
        return 0;

    int fd = open(path, O_RDONLY | O_CLOEXEC);
    if (fd < 0)
        return 0; /* no cache file */

    mc_persist_blob blob;
    ssize_t got = read(fd, &blob, sizeof blob);
    close(fd);

    /* Size is checked first so the header fields are only inspected after a full read. */
    if (got != (ssize_t)sizeof blob)
        return 0;
    if (blob.magic != MC_PERSIST_MAGIC)
        return 0;
    if (blob.version != MC_PERSIST_VERSION)
        return 0;

    *out = blob;
    return 1;
}
/* Remove the kcr3 cache file, if persist is enabled.
 *
 * Called on a destructive VM lifecycle event so that a cache which may no longer match
 * the RAM is not left on disk. Best-effort: the unlink() result is ignored, so a missing
 * file is not an error. A NULL or empty `path` (persist disabled) is a no-op.
 * Intended for the loop thread only. */
static void mc_persist_drop(const char* path) {
    if (path == NULL || path[0] == '\0')
        return;
    (void)unlink(path);
}
/* worker req/res (POD <= VMSIG_WORK_SLOT). One off-loop worker runs BOTH the cold /* worker req/res (POD <= VMSIG_WORK_SLOT). One off-loop worker runs BOTH the cold
* bootstrap and the atomic writes (FIFO serializes a write against the close-on-rebootstrap). * bootstrap and the atomic writes (FIFO serializes a write against the close-on-rebootstrap).
@@ -80,8 +156,9 @@ typedef struct {
uint32_t attempt; /* MC_JOB_BOOTSTRAP: consecutive-failure index of THIS */ uint32_t attempt; /* MC_JOB_BOOTSTRAP: consecutive-failure index of THIS */
/* kick (copy of a->boot_attempts); stub fails while */ /* kick (copy of a->boot_attempts); stub fails while */
/* attempt < a->fail_boots. NOT the epoch counter. */ /* attempt < a->fail_boots. NOT the epoch counter. */
/* --- MC_JOB_WRITE --- */ /* --- MC_JOB_WRITE / MC_JOB_RESUME --- */
uint64_t cr3; /* target AS root; 0 => a->kcr3 (kernel AS), resolved on worker */ uint64_t cr3; /* WRITE: target AS root (0 => a->kcr3); RESUME: persisted kcr3 to validate */
uint64_t low; /* MC_JOB_RESUME: below-4G split for vmie_win32_open_ro_fd (ignored by others) */
uint64_t gva; uint64_t gva;
uint32_t len; uint32_t len;
uint32_t corr; uint32_t corr;
@@ -100,6 +177,7 @@ struct vmsig_adapter {
uint32_t endpoint; uint32_t endpoint;
int stub; int stub;
const char* ram_path; /* armed: RAM-backing path (NOT published outward) */ const char* ram_path; /* armed: RAM-backing path (NOT published outward) */
const char* persist_path; /* armed: kcr3 cache file path (cfg, loop-thread-only); NULL => persist off */
uint64_t low; uint64_t low;
int cfg_ro_fd; /* >=0 => infra-sealed RO-fd (owned by adapter, closed in mc_close); <0 => default */ int cfg_ro_fd; /* >=0 => infra-sealed RO-fd (owned by adapter, closed in mc_close); <0 => default */
vmsig_emit emit; vmsig_emit emit;
@@ -232,6 +310,49 @@ static int mc_job(void* user, const void* req, void* res) {
#endif #endif
} }
if (rq->op == MC_JOB_RESUME) {
/* Fast-path boot-session re-validation: open an O_RDONLY context with the PERSISTED
* kcr3 and let the engine decide if it still resolves the kernel in the LIVE RAM.
* This is purely a READ validation — it NEVER touches a->win/a->mem/a->kcr3 (the
* RW write-hold, owned by the bootstrap worker after a fresh live scan). MEMWRITE-
* target safety: a persisted kcr3 must never become the gva_write target. */
if (a->stub) {
/* No VMIE here, so there is no real RAM to validate against: synthetically ACCEPT a
* nonzero kcr3 so the stub can exercise the persist MECHANICS (save/load/fast-vs-slow
* selection). This is NOT real boot-session validation — that is armed-only. */
if (rq->cr3 == 0) return -1;
rs->kcr3 = rq->cr3;
return 0;
}
#ifdef VMSIG_WITH_VMIE
/* fresh O_RDONLY fd over the backing (same source as mc_reg_share_fd: dup the infra
* RO-fd, else open ram_path O_RDONLY). The RO context borrows it (dup'd internally),
* so we close our copy after open. */
int rfd;
if (a->cfg_ro_fd >= 0) rfd = fcntl(a->cfg_ro_fd, F_DUPFD_CLOEXEC, 0);
else if (a->ram_path) rfd = open(a->ram_path, O_RDONLY | O_CLOEXEC);
else return -1;
if (rfd < 0) return -1;
vmie_win32* v = vmie_win32_open_ro_fd(rfd, rq->low, rq->cr3);
close(rfd); /* borrowed by open_ro_fd (dup'd internally) */
if (!v) return -1; /* kcr3 no longer resolves the kernel => stale/guest-reboot */
/* Second, independent signal: the System process must be present AND its cr3 must equal
* the persisted kcr3 (the System DTB by definition). Catches the pathology "kcr3 resolves
* a DIFFERENT kernel". Cheap — the RO context is already built. Fail-closed on mismatch. */
process procs[16];
int n = proc_list(v, 0, procs, 16);
int system_ok = 0;
for (int i = 0; i < n && i < 16; i++)
if (!strcmp(procs[i].name, "System")) { system_ok = (procs[i].cr3 == rq->cr3); break; }
vmie_win32_close(v); /* validation-only: the read datum needs no held handle */
if (!system_ok) return -1;
rs->kcr3 = rq->cr3; /* validated: publish the read datum (NOT a->kcr3) */
return 0;
#else
return -1; /* armed without the build flag: resume impossible -> cold bootstrap */
#endif
}
/* MC_JOB_BOOTSTRAP */ /* MC_JOB_BOOTSTRAP */
if (a->stub) { if (a->stub) {
/* test-only: fail the first fail_boots attempts to exercise the retry path /* test-only: fail the first fail_boots attempts to exercise the retry path
@@ -259,6 +380,45 @@ static void mc_kick_bootstrap(struct vmsig_adapter* a) {
(void)vmsig_worker_submit(a->worker, &rq, sizeof rq); /* full => drop (rare) */ (void)vmsig_worker_submit(a->worker, &rq, sizeof rq); /* full => drop (rare) */
} }
/* Queue a fast-path RESUME job on the adapter's worker.
 *
 * The request carries the cached kcr3 to be validated (in `cr3`) and the adapter's `low`
 * split, which the worker passes on when it opens the read-only context. The request is
 * zeroed first so every field not set here is 0.
 *
 * The submit result is deliberately ignored: if the worker queue is full the job is
 * dropped (expected to be rare). */
static void mc_kick_resume(struct vmsig_adapter* a, uint64_t kcr3) {
    mc_req job;
    memset(&job, 0, sizeof job);
    job.op = MC_JOB_RESUME;
    job.cr3 = kcr3;
    job.low = a->low;
    (void)vmsig_worker_submit(a->worker, &job, sizeof job);
}
/* Publish a memory context built from `kcr3`; shared by the RESUME and BOOTSTRAP
 * completions so there is exactly one way to assemble and emit a MEMCTX.
 *
 * Steps:
 *   1. Rebuild a->cur_pod from zero: kcr3, the low split (a->low, or MC_STUB_SIZE when
 *      a->low is 0) and the read-only flag.
 *   2. Describe the RAM as a single segment covering gpa 0 .. low at file offset 0.
 *   3. Mark the adapter as having a context.
 *   4. Emit an upward VMSIG_EV_MEMCTX event whose inline payload is a copy of cur_pod.
 *
 * This function writes kcr3 only into a->cur_pod (the delivery copy); it never assigns
 * a->kcr3. Intended for the loop thread only. */
static void mc_publish_ctx(struct vmsig_adapter* a, uint64_t kcr3) {
    /* 1. locator POD */
    memset(&a->cur_pod, 0, sizeof a->cur_pod);
    a->cur_pod.kcr3 = kcr3;
    a->cur_pod.flags = VMSIG_MEMCTX_RDONLY;
    if (a->low)
        a->cur_pod.low = a->low;
    else
        a->cur_pod.low = MC_STUB_SIZE;

    /* 2. single-low identity segment (gpa 0 .. low) */
    a->cur_segs[0].gpa = 0;
    a->cur_segs[0].file_off = 0;
    a->cur_segs[0].len = a->cur_pod.low;
    a->cur_nseg = 1;
    a->cur_pod.nseg = a->cur_nseg;

    /* 3. a context is now available */
    a->have_ctx = 1;

    /* 4. MEMCTX trigger carrying the POD inline */
    vmsig_event ev;
    memset(&ev, 0, sizeof ev);
    ev.kind = VMSIG_EV_MEMCTX;
    ev.source = VMSIG_SRC_MEMCTX;
    ev.dir = VMSIG_DIR_UP;
    ev.prio = VMSIG_PRIO_NORMAL;
    ev.endpoint = a->endpoint;
    memcpy(ev.inln, &a->cur_pod, sizeof a->cur_pod);
    a->emit.emit(a->emit.token, &ev);
}
/* ---- reg hooks (vmsig_memctx_reg.ctx = a; called by the core on the loop thread) ---- */ /* ---- reg hooks (vmsig_memctx_reg.ctx = a; called by the core on the loop thread) ---- */
static void mc_reg_describe(void* ctx, vmsig_memctx* out_pod, static void mc_reg_describe(void* ctx, vmsig_memctx* out_pod,
const vmsig_memseg** out_segs, uint32_t* out_nseg) { const vmsig_memseg** out_segs, uint32_t* out_nseg) {
@@ -286,6 +446,10 @@ static void mc_reg_invalidate(void* ctx, uint32_t epoch) {
struct vmsig_adapter* a = ctx; struct vmsig_adapter* a = ctx;
(void)epoch; /* the core owns the epoch; the adapter must re-bootstrap */ (void)epoch; /* the core owns the epoch; the adapter must re-bootstrap */
a->have_ctx = 0; /* the previous context is invalid */ a->have_ctx = 0; /* the previous context is invalid */
/* destructive VM-lifecycle => the RAM may have changed => drop the kcr3 cache so the next
* restart cannot fast-path off a now-dead kcr3 (the self-validation would reject it anyway,
* but we do not leave a known-stale file). Best-effort, loop-thread-only. */
mc_persist_drop(a->persist_path);
/* new cycle: drop a stale arm from the previous cycle and restart the failure counter at /* new cycle: drop a stale arm from the previous cycle and restart the failure counter at
* zero so this bootstrap's backoff starts fresh (and the first-failure diagnostic re-arms). */ * zero so this bootstrap's backoff starts fresh (and the first-failure diagnostic re-arms). */
a->boot_attempts = 0; a->boot_attempts = 0;
@@ -307,6 +471,7 @@ static vmsig_adapter* mc_open(const void* cfg, uint32_t endpoint) {
a->stub_fd = -1; a->stub_fd = -1;
a->retry_fd = -1; a->retry_fd = -1;
a->fail_boots = c ? c->fail_boots : 0; /* set once; read-only afterwards (worker reads) */ a->fail_boots = c ? c->fail_boots : 0; /* set once; read-only afterwards (worker reads) */
a->persist_path = c ? c->persist_path : NULL; /* NULL => persist disabled (cold bootstrap only) */
return a; return a;
} }
@@ -364,7 +529,16 @@ static int mc_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg
up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint;
a->emit.emit(a->emit.token, &up); a->emit.emit(a->emit.token, &up);
mc_kick_bootstrap(a); /* first bootstrap off-loop; assemble the locator on completion */ /* Fast-path: if a kcr3 cache exists, try a RESUME (re-validate it against live RAM) BEFORE
* the cold scan. On a daemon restart over an unchanged guest this publishes the read datum
* in milliseconds instead of minutes of beacon-scan retry. On any miss (persist off / stub /
* no file / corrupt) we fall straight into the existing cold bootstrap. The RW-hold for
* MEMWRITE is still acquired by a cold bootstrap (kicked in parallel after a RESUME hit). */
mc_persist_blob b;
if (a->persist_path && *a->persist_path && mc_persist_load(a->persist_path, &b))
mc_kick_resume(a, b.kcr3); /* validate the cached kcr3 off-loop; cold fallback on miss */
else
mc_kick_bootstrap(a); /* first cold bootstrap off-loop; assemble locator on completion */
return 2; /* worker eventfd + backoff timerfd */ return 2; /* worker eventfd + backoff timerfd */
} }
@@ -391,6 +565,27 @@ static int mc_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) {
mc_memwrite_ack(a, rs.ok && rc == 0, rs.corr, rs.origin); mc_memwrite_ack(a, rs.ok && rc == 0, rs.corr, rs.origin);
continue; continue;
} }
if (rs.op == MC_JOB_RESUME) {
/* Fast-path completion. The persisted kcr3 was validated against the LIVE RAM on the
* worker (open_ro_fd != NULL [+ System-cr3 match]) — the read datum is safe to publish.
* Note: the worker did NOT set a->kcr3/a->win/a->mem (the RW write-hold), so MEMWRITE
* stays ok=0 until a cold bootstrap acquires it. */
if (rc == 0) {
mc_publish_ctx(a, rs.kcr3); /* video lives instantly (read datum), epoch by core */
mc_kick_bootstrap(a); /* in parallel: acquire the RW-hold (a->kcr3) for MEMWRITE */
/* Do NOT save the persist (the kcr3 came FROM the file) and do NOT arm a retry
* (the read datum is up; the parallel bootstrap arms its own retry on failure). */
} else {
/* validation miss: the persisted kcr3 no longer resolves the kernel (guest rebooted
* or corrupt). Fall back to an honest cold scan; on success it rewrites the persist
* with a fresh kcr3. Do NOT retry the RESUME — the cache is under suspicion. */
mc_kick_bootstrap(a);
}
continue;
}
/* MC_JOB_BOOTSTRAP */
if (rc != 0) { if (rc != 0) {
/* bootstrap failed: the guest is likely still booting (host_bootstrap found no /* bootstrap failed: the guest is likely still booting (host_bootstrap found no
* System process). This is NOT a control-level error — do NOT emit VMSIG_EV_ERROR * System process). This is NOT a control-level error — do NOT emit VMSIG_EV_ERROR
@@ -405,34 +600,29 @@ static int mc_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) {
mc_arm_retry(a); /* one-shot timer at mc_boot_backoff(boot_attempts) */ mc_arm_retry(a); /* one-shot timer at mc_boot_backoff(boot_attempts) */
continue; continue;
} }
/* assemble the locator on the loop thread from rs.kcr3. a->kcr3 is the gva_write /* bootstrap succeeded: a->kcr3/a->mem (the gva_write TARGET / RW-hold) were set on the
* TARGET and is owned SOLELY by the worker thread (set in mc_bootstrap_armed, read by * worker (mc_bootstrap_armed); the loop must NOT also write a->kcr3 (it would race an
* MC_JOB_WRITE — same thread, FIFO happens-before); the loop must NOT also write it, or * in-flight write — same FIFO thread owns it). MEMWRITE is now possible. cur_pod.kcr3 is
* an in-flight write at line ~170 would race it. cur_pod.kcr3 is loop-only (delivery). */ * loop-only (delivery) and is set inside mc_publish_ctx.
*
/* bootstrap succeeded: cancel any pending retry and reset the failure counter BEFORE * Cancel any pending retry and reset the failure counter BEFORE publishing, so a stale
* publishing, so a stale timer armed by a prior failure cannot fire over a live context. */ * timer armed by a prior failure cannot fire over a live context. */
a->boot_attempts = 0; a->boot_attempts = 0;
mc_disarm_retry(a); mc_disarm_retry(a);
memset(&a->cur_pod, 0, sizeof a->cur_pod); /* Publish only if a RESUME has not already published this same context (same kcr3): a
a->cur_pod.kcr3 = rs.kcr3; * parallel cold bootstrap after a RESUME hit must acquire the RW-hold WITHOUT emitting a
a->cur_pod.low = a->low ? a->low : MC_STUB_SIZE; * redundant MEMCTX. First-time publication otherwise. */
a->cur_pod.flags = VMSIG_MEMCTX_RDONLY; if (!a->have_ctx)
a->cur_nseg = 1; /* single-low identity (gpa 0 .. low) */ mc_publish_ctx(a, rs.kcr3);
a->cur_segs[0].gpa = 0;
a->cur_segs[0].len = a->cur_pod.low;
a->cur_segs[0].file_off = 0;
a->cur_pod.nseg = a->cur_nseg;
a->have_ctx = 1;
/* emit the MEMCTX trigger: the core authoritatively re-describes + stamps the epoch. */ /* Cache the freshly-scanned kcr3 for the next daemon restart (best-effort; the datum is
vmsig_event up; * already published). Only the cold scan writes the persist — never the RESUME path (its
memset(&up, 0, sizeof up); * kcr3 came from the file). Gated on persist_path presence: production stub paths get a
up.kind = VMSIG_EV_MEMCTX; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP; * NULL persist_path from discovery, so they never write; a test may supply one to exercise
up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; * the persist mechanics (the stub bootstrap yields a synthetic-but-stable kcr3). */
memcpy(up.inln, &a->cur_pod, sizeof a->cur_pod); if (a->persist_path && *a->persist_path)
a->emit.emit(a->emit.token, &up); (void)mc_persist_save(a->persist_path, rs.kcr3);
} }
return 0; return 0;
} }
+16 -1
View File
@@ -60,6 +60,10 @@ struct vmsig_discovery {
* writes these at attach; the vmhost seam borrows them to add input-linux objects. Same * writes these at attach; the vmhost seam borrows them to add input-linux objects. Same
* lifetime discipline as ep_facts (outlives the deferred adapter reap). */ * lifetime discipline as ep_facts (outlives the deferred adapter reap). */
struct { char evdev_a[64]; char evdev_b[64]; } ep_bridge[VMSIG_SLOT_COUNT]; struct { char evdev_a[64]; char evdev_b[64]; } ep_bridge[VMSIG_SLOT_COUNT];
/* Stable per-endpoint home for the memctx kcr3-cache path (sibling of .slots in the watch
* dir). The memctx adapter keeps the pointer across its lifetime; same lifetime discipline
* as ep_facts/ep_bridge (outlives the deferred adapter reap, overwritten on next attach). */
char ep_persist[VMSIG_SLOT_COUNT][DISC_PATH_MAX + 32];
}; };
static uint64_t now_ns(void) { static uint64_t now_ns(void) {
@@ -269,14 +273,25 @@ static void bootstrap_scan(vmsig_discovery* d) {
static int default_attach(void* ud, vmsig_core* core, uint32_t vmid, uint32_t endpoint, static int default_attach(void* ud, vmsig_core* core, uint32_t vmid, uint32_t endpoint,
const vmsig_host_facts* f) { const vmsig_host_facts* f) {
(void)vmid;
vmsig_discovery* d = ud; /* default sink carries the discovery handle (ep_bridge home) */ vmsig_discovery* d = ud; /* default sink carries the discovery handle (ep_bridge home) */
char* ev_a = d ? d->ep_bridge[endpoint].evdev_a : NULL; char* ev_a = d ? d->ep_bridge[endpoint].evdev_a : NULL;
char* ev_b = d ? d->ep_bridge[endpoint].evdev_b : NULL; char* ev_b = d ? d->ep_bridge[endpoint].evdev_b : NULL;
if (d) { ev_a[0] = '\0'; ev_b[0] = '\0'; } /* clear stale paths from a prior attach */ if (d) { ev_a[0] = '\0'; ev_b[0] = '\0'; } /* clear stale paths from a prior attach */
/* Form the kcr3-cache path (per-vmid, sibling of .slots/the RAM file in the watch dir).
* Gated on d->persist — one policy for all ephemeral watch-dir state. NULL => persist off. */
const char* persist_path = NULL;
if (d && d->persist) {
int pn = snprintf(d->ep_persist[endpoint], sizeof d->ep_persist[endpoint],
"%s/.kcr3-vm-%u", d->watch_dir, vmid);
/* only enable the cache if the path fit (a truncated path would point elsewhere). */
if (pn > 0 && (size_t)pn < sizeof d->ep_persist[endpoint])
persist_path = d->ep_persist[endpoint];
}
vmsig_memctx_cfg mc; memset(&mc, 0, sizeof mc); vmsig_memctx_cfg mc; memset(&mc, 0, sizeof mc);
mc.stub = 0; mc.ram_path = f->ram_path; mc.low = f->low; mc.ro_fd = -1; mc.stub = 0; mc.ram_path = f->ram_path; mc.low = f->low; mc.ro_fd = -1;
mc.persist_path = persist_path;
vmsig_input_cfg in; memset(&in, 0, sizeof in); vmsig_input_cfg in; memset(&in, 0, sizeof in);
/* input is uinput; power/lifecycle via the vmhost seam. The adapter publishes its uinput /* input is uinput; power/lifecycle via the vmhost seam. The adapter publishes its uinput
* evdev paths into ep_bridge so the vmhost seam can forward them via input-linux. */ * evdev paths into ep_bridge so the vmhost seam can forward them via input-linux. */
+128
View File
@@ -428,6 +428,130 @@ static void test_retry(void) {
vmsig_ctx_free(ctx); vmsig_ctx_free(ctx);
} }
/* ---- 8-11. kcr3-persist MECHANICS (stub) ---------------------------------- *
* These exercise the persist MACHINERY only: save/load, corruption fail-soft, drop-on-
* invalidate, and the fast-vs-slow path selection. They do NOT exercise the real boot-session
* validation (vmie_win32_open_ro_fd rejecting a stale kcr3) — that is VMIE-dependent and is
* covered only on the armed stand. Under the stub, MC_JOB_RESUME synthetically ACCEPTS any
* nonzero kcr3 (there is no live RAM to validate against), so a successful RESUME here proves
* the mechanism wired the cached kcr3 into a publication, NOT that the kcr3 was validated. */
/* Return 1 if `path` names an existing filesystem entry (access(F_OK) succeeds), else 0. */
static int file_exists(const char* path) {
    if (access(path, F_OK) != 0)
        return 0;
    return 1;
}
/* Drive one stub memctx endpoint on a private core until vmsig_core_run() returns, then
 * report what the observing holder saw.
 *
 * out_kcr3     : if non-NULL, receives the holder's last seen kcr3.
 * out_memctx   : if non-NULL, receives the holder's MEMCTX counter.
 * persist_path : forwarded to the memctx adapter config (NULL => persist off).
 * fail_boots   : forwarded to the memctx adapter config (stub bootstrap failures).
 *
 * The core and context are created and freed inside this call. */
static void run_once(uint64_t* out_kcr3, int* out_memctx, const char* persist_path,
                     uint32_t fail_boots) {
    vmsig_ctx* ctx = vmsig_ctx_new();
    vmsig_core* core = vmsig_core_new(ctx);

    /* observing holder on endpoint 0 */
    holder obs;
    memset(&obs, 0, sizeof obs);
    obs.core = core;
    obs.is_driver = 1;
    obs.expect_ep = 0;
    obs.stop_epoch = -1;
    add_holder(core, &obs, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0);
    CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost (watchdog)");

    /* stub memctx adapter: no RAM file, no infra RO-fd */
    vmsig_memctx_cfg mc;
    memset(&mc, 0, sizeof mc);
    mc.stub = 1;
    mc.ram_path = NULL;
    mc.low = 0;
    mc.ro_fd = -1;
    mc.fail_boots = fail_boots;
    mc.persist_path = persist_path;
    CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), &mc, 0) >= 0, "add memctx");

    vmsig_core_run(core);

    if (out_kcr3 != NULL)
        *out_kcr3 = obs.last_kcr3;
    if (out_memctx != NULL)
        *out_memctx = obs.memctx;

    vmsig_core_free(core);
    vmsig_ctx_free(ctx);
}
/* 8. save-then-resume.
 * Run 1 (fail_boots = 0) goes through the cold stub bootstrap, publishes a MEMCTX and is
 * expected to leave a cache file at `path`. Run 2 reuses the same path with
 * fail_boots = 1000: a cold bootstrap would keep failing, so a MEMCTX that carries the
 * kcr3 from run 1 shows the RESUME fast-path served it from the cache. */
static void test_persist_save_then_resume(void) {
    printf("test_persist_save_then_resume\n");

    /* per-process cache path, cleared of leftovers from an earlier run */
    char path[256];
    snprintf(path, sizeof path, "/tmp/vmsig-kcrx-%d.bin", (int)getpid());
    unlink(path);

    /* run 1: cold bootstrap, writes the cache */
    uint64_t k1 = 0;
    int m1 = 0;
    run_once(&k1, &m1, path, 0);
    CHECK(m1 >= 1, "run1 published MEMCTX");
    CHECK(k1 != 0, "run1 kcr3 nonzero");
    CHECK(file_exists(path), "run1 wrote the kcr3 cache file");

    /* run 2: bootstrap is set to fail 1000 times, so only RESUME can publish promptly */
    uint64_t k2 = 0;
    int m2 = 0;
    run_once(&k2, &m2, path, 1000);
    CHECK(m2 >= 1, "run2 published MEMCTX via the RESUME fast-path (bootstrap would have failed)");
    CHECK(k2 == k1, "run2 published the SAVED kcr3 (resumed from cache, not a fresh scan)");

    unlink(path);
}
/* 9. corrupt cache.
 * A one-byte file is placed at the cache path, which is too short to be a valid blob.
 * The load must miss (fail-soft) and the cold bootstrap (fail_boots = 0) must still
 * bring the context up. */
static void test_persist_corrupt(void) {
    printf("test_persist_corrupt\n");

    char path[256];
    snprintf(path, sizeof path, "/tmp/vmsig-kcrx-corrupt-%d.bin", (int)getpid());

    /* plant the bogus cache: a single byte */
    int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600);
    CHECK(fd >= 0, "created a corrupt cache file");
    if (fd >= 0) {
        (void)!write(fd, "x", 1);
        close(fd);
    }

    uint64_t k = 0;
    int m = 0;
    run_once(&k, &m, path, 0);
    CHECK(m >= 1, "MEMCTX still published after a corrupt cache (fail-soft load)");
    CHECK(k != 0, "kcr3 nonzero from the cold bootstrap");

    unlink(path);
}
/* 10. invalidate drops the cache.
 * The holder injects a destructive lifecycle event during epoch 0 and stops after
 * epoch 1. Expected sequence: the epoch-0 bootstrap writes the cache, the invalidation
 * removes it, and the epoch-1 bootstrap writes it again. Only the end state is asserted
 * (invalidation seen, epoch 1 reached, file present). */
static void test_persist_invalidate_drop(void) {
    printf("test_persist_invalidate_drop\n");

    char path[256];
    snprintf(path, sizeof path, "/tmp/vmsig-kcrx-inv-%d.bin", (int)getpid());
    unlink(path);

    vmsig_ctx* ctx = vmsig_ctx_new();
    vmsig_core* core = vmsig_core_new(ctx);

    /* holder that resets on epoch 0 and ends the run once epoch 1 is published */
    holder h;
    memset(&h, 0, sizeof h);
    h.core = core;
    h.is_driver = 1;
    h.expect_ep = 0;
    h.inject_reset = 1;
    h.stop_epoch = 1;
    add_holder(core, &h, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0);
    CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost (watchdog)");

    /* stub memctx adapter with persist enabled at `path` */
    vmsig_memctx_cfg mc;
    memset(&mc, 0, sizeof mc);
    mc.stub = 1;
    mc.ram_path = NULL;
    mc.low = 0;
    mc.ro_fd = -1;
    mc.persist_path = path;
    CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), &mc, 0) >= 0, "add memctx");

    vmsig_core_run(core);

    CHECK(h.invalidated >= 1, "invalidation fired");
    CHECK(h.last_epoch == 1, "re-published at epoch 1 after invalidate");
    CHECK(file_exists(path), "cache rewritten by the post-invalidate bootstrap");

    vmsig_core_free(core);
    vmsig_ctx_free(ctx);
    unlink(path);
}
/* 11. persist disabled (persist_path = NULL).
 * The endpoint must still publish a MEMCTX, and no cache file may appear.
 * NOTE(review): `path` is never handed to the adapter, so the file check only confirms
 * that nothing shows up under this particular name; it cannot detect a write elsewhere. */
static void test_persist_stub_disabled(void) {
    printf("test_persist_stub_disabled\n");

    char path[256];
    snprintf(path, sizeof path, "/tmp/vmsig-kcrx-off-%d.bin", (int)getpid());
    unlink(path);

    uint64_t k = 0;
    int m = 0;
    run_once(&k, &m, NULL, 0); /* persist off */
    CHECK(m >= 1, "MEMCTX published with persist disabled");
    CHECK(!file_exists(path), "no cache file created when persist is disabled");

    unlink(path); /* harmless if the file was never created */
}
int main(void) { int main(void) {
test_multicast(); test_multicast();
test_epoch(); test_epoch();
@@ -436,6 +560,10 @@ int main(void) {
test_socket(); test_socket();
test_ro_fd_ownership(); test_ro_fd_ownership();
test_retry(); test_retry();
test_persist_save_then_resume();
test_persist_corrupt();
test_persist_invalidate_drop();
test_persist_stub_disabled();
printf("memctx tests: %s\n", g_fail ? "FAIL" : "PASS"); printf("memctx tests: %s\n", g_fail ? "FAIL" : "PASS");
return g_fail ? 1 : 0; return g_fail ? 1 : 0;
} }