/* memctx.c — vmie sensor adapter: vends ONE coherent guest address-space context — * the permanent System DirectoryTableBase (`kcr3`) PAIRED with a RAM-region locator * and a pre-opened O_RDONLY fd. This is NOT perception and NOT semantics: signaling * multicasts the datum + RO-fd, while the holder (an S-lib / any control) opens ITS OWN * read-only vmie_mem from the fd and does gva_read/scan/pmap itself. * * Cold bring-up (host_bootstrap) is CPU-bound and blocking, so it runs on an off-loop * worker; the loop thread only assembles the locator on the completion-eventfd and emits * the MEMCTX trigger. The epoch is stamped by the CORE (retained-context); on an epoch * change the core calls reg.invalidate, the adapter re-bootstraps and re-emits MEMCTX. * * RO outward is physical: O_RDONLY fd => mmap(PROT_WRITE) -> EACCES, so a write into the * guest on the holder side is structurally impossible. stub mode (without VMSIG_WITH_VMIE * or ram_path==NULL) synthesizes a kcr3 and a genuinely RO-mappable fd (memfd + seal) — * the seam is provable without a VM. */ #define _GNU_SOURCE #include "vmsig_adapter.h" #include "memctx.h" #include "adapter_util.h" /* vmsig_worker (off-loop bootstrap) */ #include #include #include #include #include #include #include #include #ifdef VMSIG_WITH_VMIE #include "win32.h" /* vmie_win32_open/host_bootstrap/proc_list/close */ #endif /* memfd_create / seal — ABI fallbacks for old glibc/kernel (stub RO-fd backing). */ #ifndef MFD_CLOEXEC #include #include static int memfd_create(const char* name, unsigned int flags) { return (int)syscall(SYS_memfd_create, name, flags); } #endif #ifndef MFD_ALLOW_SEALING #define MFD_ALLOW_SEALING 0x0002U #endif #ifndef F_ADD_SEALS #define F_ADD_SEALS (1024 + 9) #define F_SEAL_SHRINK 0x0002 #define F_SEAL_GROW 0x0004 #endif #ifndef F_SEAL_FUTURE_WRITE #define F_SEAL_FUTURE_WRITE 0x0010 /* kernel 5.1+: forbid future writable mappings */ #endif #define MC_STUB_SIZE 0x10000u /* 64 KB of synthetic RAM image (stub) */ #define MC_MAX_SEG 8 #define MC_WORKER_DEPTH 16 /* one off-loop thread: rare bootstrap + writes */ enum { MC_JOB_BOOTSTRAP = 0, MC_JOB_WRITE = 1 }; /* worker req/res (POD <= VMSIG_WORK_SLOT). One off-loop worker runs BOTH the cold * bootstrap and the atomic writes (FIFO serializes a write against the close-on-rebootstrap). * boot_count drives the stub kcr3 (changes per epoch); the real guest kcr3 does NOT depend * on it (armed reads the System DTB). MC_JOB_WRITE copies SRC off-loop into req.src plus the * target cr3 (0 => System DTB; resolved on the worker against a->kcr3). */ typedef struct { uint32_t op; /* MC_JOB_* */ uint32_t boot_count; /* MC_JOB_BOOTSTRAP */ /* --- MC_JOB_WRITE --- */ uint64_t cr3; /* target AS root; 0 => a->kcr3 (kernel AS), resolved on worker */ uint64_t gva; uint32_t len; uint32_t corr; uint32_t origin; uint8_t src[VMSIG_MEMWRITE_MAX]; /* SRC bytes copied off-loop (gva_write reads this) */ } mc_req; typedef struct { uint32_t op; /* echoes the job type so on_ready demuxes */ int ok; /* MC_JOB_WRITE result */ uint32_t corr; uint32_t origin; uint64_t kcr3; /* MC_JOB_BOOTSTRAP result */ } mc_res; struct vmsig_adapter { uint32_t endpoint; int stub; const char* ram_path; /* armed: RAM-backing path (NOT published outward) */ uint64_t low; int cfg_ro_fd; /* >=0 => infra-sealed RO-fd (owned by adapter, closed in mc_close); <0 => default */ vmsig_emit emit; int registered; /* register_memctx already called */ vmsig_worker* worker; /* off-loop bootstrap + atomic writes */ uint32_t boot_count; /* incremented on each (re-)bootstrap */ #ifdef VMSIG_WITH_VMIE vmie_win32* win; /* held RW handle across the epoch (kcr3 source + gva_write target) */ vmie_mem* mem; /* vmie_win32_mem(win); borrowed, valid until vmie_win32_close */ #endif uint64_t kcr3; /* current System DTB (also published in cur_pod.kcr3) */ /* persistent locator: owned by the loop thread; worker only yields kcr3 into scratch. */ int have_ctx; vmsig_memctx cur_pod; /* kcr3/low/nseg/flags (epoch stamped by the core) */ vmsig_memseg cur_segs[MC_MAX_SEG]; uint32_t cur_nseg; int stub_fd; /* stub: memfd of synth RAM (+seal); share_fd reopens it */ }; /* fwd: MEMWRITE completion ACK (defined below mc_submit; used in mc_on_ready demux). */ static void mc_memwrite_ack(struct vmsig_adapter* a, int ok, uint32_t corr, uint32_t origin); /* ---- stub RO-fd: memfd + deterministic contents + seal of future writes ---- */ static int mc_make_stub_fd(uint32_t size) { int fd = memfd_create("vmsig_memctx", MFD_CLOEXEC | MFD_ALLOW_SEALING); if (fd < 0) fd = memfd_create("vmsig_memctx", MFD_CLOEXEC); if (fd < 0) return -1; if (ftruncate(fd, (off_t)size) != 0) { close(fd); return -1; } /* deterministic contents via a temporary RW mapping BEFORE the seal */ uint8_t* p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (p != MAP_FAILED) { for (uint32_t i = 0; i < size; i++) p[i] = (uint8_t)(i & 0xFFu); munmap(p, size); } /* FUTURE_WRITE: even if the holder reopens the fd as O_RDWR, it gets no writable mapping. * best-effort (kernel 5.1+); on older kernels only the O_RDONLY fd protects. */ if (fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_FUTURE_WRITE) != 0) (void)fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW); return fd; } #ifdef VMSIG_WITH_VMIE /* armed bring-up: open RAM (RW is vmie's internal concern), host_bootstrap, extract the * permanent System DTB as the System process cr3 (kcr3 — the root of the guest AS). The RW * handle is HELD across the epoch (kcr3 source + gva_write target); ONLY the RO-fd (share_fd) * leaves outward — write goes through this command plane, never a writable mmap. Runs on the * off-loop worker; a stale handle from a prior epoch is dropped first (serialized FIFO with * in-flight writes). */ static int mc_bootstrap_armed(struct vmsig_adapter* a, uint64_t* out_kcr3) { if (a->win) { vmie_win32_close(a->win); a->win = NULL; a->mem = NULL; } /* drop stale epoch handle */ vmie_win32* v = vmie_win32_open(a->ram_path, a->low); if (!v) return -1; if (host_bootstrap(v) != 0) { vmie_win32_close(v); return -1; } process procs[16]; int n = proc_list(v, 0, procs, 16); uint64_t kcr3 = 0; for (int i = 0; i < n && i < 16; i++) if (!strcmp(procs[i].name, "System")) { kcr3 = procs[i].cr3; break; } if (!kcr3) { vmie_win32_close(v); return -1; } a->win = v; /* HOLD: RW handle lives across the epoch */ a->mem = vmie_win32_mem(v); /* borrowed; valid until vmie_win32_close(v) */ a->kcr3 = kcr3; *out_kcr3 = kcr3; return 0; } #endif /* ---- worker job: cold bring-up OR atomic write, off-loop ----------------- * * Demultiplexed by rq->op. BOTH run on the SAME single worker thread, so a write on the * held handle never races the close-on-rebootstrap (FIFO). The job MUST NOT touch core * structures — it only reads a->mem/a->kcr3 (stable between re-bootstraps on this thread). */ static int mc_job(void* user, const void* req, void* res) { struct vmsig_adapter* a = user; const mc_req* rq = req; mc_res* rs = res; memset(rs, 0, sizeof *rs); rs->op = rq->op; if (rq->op == MC_JOB_WRITE) { rs->corr = rq->corr; rs->origin = rq->origin; if (a->stub) { rs->ok = 1; return 0; } /* stub: ack without actuation */ #ifdef VMSIG_WITH_VMIE /* a->mem is NULL until a bootstrap has succeeded (or after one failed and cleared it): * the guard turns that into an ok=0 ACK (observable to the initiator), not a crash. * cr3 resolve is on the worker (sole owner of a->kcr3): 0 => kernel AS (System DTB). */ uint64_t target = rq->cr3 ? rq->cr3 : a->kcr3; rs->ok = (a->mem && gva_write(a->mem, (uintptr_t)target, (uintptr_t)rq->gva, rq->src, rq->len) == 0); return rs->ok ? 0 : -1; #else rs->ok = 0; return -1; /* armed without the build flag: write impossible */ #endif } /* MC_JOB_BOOTSTRAP */ if (a->stub) { rs->kcr3 = 0xC0DE0000ull + (uint64_t)rq->boot_count * 0x1000ull; /* changes per epoch */ return 0; } #ifdef VMSIG_WITH_VMIE uint64_t kcr3 = 0; if (mc_bootstrap_armed(a, &kcr3) != 0) return -1; rs->kcr3 = kcr3; return 0; #else return -1; /* armed without the build flag: bootstrap impossible -> ERROR */ #endif } static void mc_kick_bootstrap(struct vmsig_adapter* a) { a->boot_count++; mc_req rq; memset(&rq, 0, sizeof rq); rq.op = MC_JOB_BOOTSTRAP; rq.boot_count = a->boot_count; (void)vmsig_worker_submit(a->worker, &rq, sizeof rq); /* full => drop (rare) */ } /* ---- reg hooks (vmsig_memctx_reg.ctx = a; called by the core on the loop thread) ---- */ static void mc_reg_describe(void* ctx, vmsig_memctx* out_pod, const vmsig_memseg** out_segs, uint32_t* out_nseg) { struct vmsig_adapter* a = ctx; *out_pod = a->cur_pod; /* kcr3/low/nseg/flags; the core overwrites the epoch */ *out_segs = a->cur_segs; *out_nseg = a->cur_nseg; } static int mc_reg_share_fd(void* ctx) { struct vmsig_adapter* a = ctx; if (a->cfg_ro_fd >= 0) return fcntl(a->cfg_ro_fd, F_DUPFD_CLOEXEC, 0); /* infra-sealed RO-fd: dup */ if (a->stub) { if (a->stub_fd < 0) return -1; char path[64]; snprintf(path, sizeof path, "/proc/self/fd/%d", a->stub_fd); return open(path, O_RDONLY | O_CLOEXEC); /* fresh O_RDONLY on the backing */ } if (!a->ram_path) return -1; return open(a->ram_path, O_RDONLY | O_CLOEXEC); /* armed default */ } static void mc_reg_invalidate(void* ctx, uint32_t epoch) { struct vmsig_adapter* a = ctx; (void)epoch; /* the core owns the epoch; the adapter must re-bootstrap */ a->have_ctx = 0; /* the previous context is invalid */ mc_kick_bootstrap(a); /* off-loop; on_ready re-emits MEMCTX (new epoch) */ } /* ---- vtable ---- */ static vmsig_adapter* mc_open(const void* cfg, uint32_t endpoint) { const vmsig_memctx_cfg* c = cfg; struct vmsig_adapter* a = calloc(1, sizeof *a); if (!a) return NULL; a->endpoint = endpoint; a->stub = c ? c->stub : 1; a->ram_path = c ? c->ram_path : NULL; a->low = c ? c->low : 0; a->cfg_ro_fd = (c && c->ro_fd >= 0) ? c->ro_fd : -1; if (!a->ram_path && a->cfg_ro_fd < 0) a->stub = 1; /* no path/fd => stub */ a->stub_fd = -1; return a; } static int mc_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg, int cap) { if (cap < 1) return -1; a->emit = *emit; a->worker = vmsig_worker_new(mc_job, a, 1, MC_WORKER_DEPTH); if (!a->worker) return -1; if (a->stub && a->cfg_ro_fd < 0) { a->stub_fd = mc_make_stub_fd(MC_STUB_SIZE); if (a->stub_fd < 0) { vmsig_worker_free(a->worker); a->worker = NULL; return -1; } } /* worker completion-eventfd as the readiness source (cookie=0). */ reg[0].fd = vmsig_worker_evfd(a->worker); reg[0].epoll_events = EPOLLIN; reg[0].shape = VMSIG_RDY_EVENTFD; reg[0].cookie = 0; /* register the reg BEFORE the first bootstrap: the core slot gets the hooks. describe * is not called until the slot is valid (which only happens after the first MEMCTX). */ if (a->emit.register_memctx) { vmsig_memctx_reg r; memset(&r, 0, sizeof r); r.endpoint = a->endpoint; r.source = VMSIG_SRC_MEMCTX; r.ctx = a; r.describe = mc_reg_describe; r.share_fd = mc_reg_share_fd; r.invalidate = mc_reg_invalidate; if (a->emit.register_memctx(a->emit.token, &r) == 0) a->registered = 1; } vmsig_event up; memset(&up, 0, sizeof up); up.kind = VMSIG_EV_SEAM_UP; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP; up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; a->emit.emit(a->emit.token, &up); mc_kick_bootstrap(a); /* first bootstrap off-loop; assemble the locator on completion */ return 1; } static int mc_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) { (void)cookie; (void)events; vmsig_worker_ack(a->worker); mc_res rs; int rc; while (vmsig_worker_poll(a->worker, &rs, sizeof rs, &rc) == 1) { if (rs.op == MC_JOB_WRITE) { /* atomic write completed: addressed ACT_ACK to the initiator. */ mc_memwrite_ack(a, rs.ok && rc == 0, rs.corr, rs.origin); continue; } if (rc != 0) { /* bootstrap failed: ERROR (source MEMCTX); do NOT publish an invalid kcr3. */ vmsig_event er; memset(&er, 0, sizeof er); er.kind = VMSIG_EV_ERROR; er.source = VMSIG_SRC_MEMCTX; er.dir = VMSIG_DIR_UP; er.prio = VMSIG_PRIO_URGENT; er.endpoint = a->endpoint; a->emit.emit(a->emit.token, &er); continue; } /* assemble the locator on the loop thread from rs.kcr3. a->kcr3 is the gva_write * TARGET and is owned SOLELY by the worker thread (set in mc_bootstrap_armed, read by * MC_JOB_WRITE — same thread, FIFO happens-before); the loop must NOT also write it, or * an in-flight write at line ~170 would race it. cur_pod.kcr3 is loop-only (delivery). */ memset(&a->cur_pod, 0, sizeof a->cur_pod); a->cur_pod.kcr3 = rs.kcr3; a->cur_pod.low = a->low ? a->low : MC_STUB_SIZE; a->cur_pod.flags = VMSIG_MEMCTX_RDONLY; a->cur_nseg = 1; /* single-low identity (gpa 0 .. low) */ a->cur_segs[0].gpa = 0; a->cur_segs[0].len = a->cur_pod.low; a->cur_segs[0].file_off = 0; a->cur_pod.nseg = a->cur_nseg; a->have_ctx = 1; /* emit the MEMCTX trigger: the core authoritatively re-describes + stamps the epoch. */ vmsig_event up; memset(&up, 0, sizeof up); up.kind = VMSIG_EV_MEMCTX; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP; up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; memcpy(up.inln, &a->cur_pod, sizeof a->cur_pod); a->emit.emit(a->emit.token, &up); } return 0; } /* Emit an addressed ACT_ACK for a MEMWRITE (source MEMCTX, to the initiator). inln carries * {ok,corr,origin} (same shape as the input adapter's ACK), so control reads ok at offset 0. * ok=0 covers extent-deny / no-SRC / queue-full / write failure (default-deny, observable). */ static void mc_memwrite_ack(struct vmsig_adapter* a, int ok, uint32_t corr, uint32_t origin) { struct { int ok; uint32_t corr; uint32_t origin; } body = { ok, corr, origin }; vmsig_event up; memset(&up, 0, sizeof up); up.kind = VMSIG_EV_ACT_ACK; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP; up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; up.corr = corr; up.origin = origin; up.payload.flags = VMSIG_PL_INLINE; memcpy(up.inln, &body, sizeof body); a->emit.emit(a->emit.token, &up); } /* DOWN MEMWRITE handler: validate extent, copy SRC off-loop, submit the atomic gva_write to * the worker. Default-deny: any invalid path (no SRC flag, len out of bounds, short payload, * queue full) ACKs ok=0 and does NOT actuate. The completion ACK for a queued write arrives * via mc_on_ready. Returns 0 when the event is consumed by this seam, 1 when it is not ours. */ static int mc_submit(vmsig_adapter* a, const vmsig_event* ev) { if (ev->kind != VMSIG_EV_CMD_MEMWRITE) return 1; /* not for this seam */ const vmsig_memwrite* mw = (const vmsig_memwrite*)ev->inln; uint32_t len = mw->len; if (len == 0 || len > VMSIG_MEMWRITE_MAX) { /* extent: bounded */ mc_memwrite_ack(a, 0, ev->corr, ev->origin); return 0; } mc_req rq; memset(&rq, 0, sizeof rq); rq.op = MC_JOB_WRITE; rq.cr3 = mw->cr3; rq.gva = mw->gva; rq.len = len; rq.corr = ev->corr; rq.origin = ev->origin; /* copy SRC into the worker req (off-loop gva_write reads from rq.src). */ if (mw->flags & VMSIG_MW_SRC_INLINE) { if (len > VMSIG_MEMWRITE_INLINE) { mc_memwrite_ack(a, 0, ev->corr, ev->origin); return 0; } memcpy(rq.src, ev->inln + sizeof *mw, len); /* inln tail after the 24-byte header */ } else if (mw->flags & VMSIG_MW_SRC_PAYLOAD) { if (!ev->payload.data || ev->payload.len < len) { mc_memwrite_ack(a, 0, ev->corr, ev->origin); return 0; } memcpy(rq.src, ev->payload.data, len); /* in-proc borrowed payload */ } else { mc_memwrite_ack(a, 0, ev->corr, ev->origin); /* no SRC flag */ return 0; } if (vmsig_worker_submit(a->worker, &rq, sizeof rq) != 0) { mc_memwrite_ack(a, 0, ev->corr, ev->origin); /* queue full -> ACK err */ return -1; } return 0; /* completion ACK arrives via mc_on_ready */ } static void mc_close(vmsig_adapter* a) { if (!a) return; if (a->registered && a->emit.unregister_memctx) a->emit.unregister_memctx(a->emit.token, a->endpoint); if (a->worker) vmsig_worker_free(a->worker); /* join: bootstrap + write jobs finished */ #ifdef VMSIG_WITH_VMIE if (a->win) vmie_win32_close(a->win); /* AFTER worker join: no in-flight gva_write */ #endif if (a->stub_fd >= 0) close(a->stub_fd); /* ro_fd ownership transferred to the adapter at open(): close it here so a re-grant * (detach + re-attach with a fresh infra ro_fd) does not leak the prior one. Infra * that must keep its own copy dups before handing it in — symmetric to the holder * side, which dups the borrowed RO-fd it receives. */ if (a->cfg_ro_fd >= 0) close(a->cfg_ro_fd); free(a); } static const vmsig_adapter_ops MC_OPS = { .name = "memctx", .source = VMSIG_SRC_MEMCTX, .codec = VMSIG_CODEC_MEMCTX, .open = mc_open, .attach = mc_attach, .on_readiness = mc_on_ready, .submit = mc_submit, .close = mc_close }; const vmsig_adapter_ops* vmsig_memctx_ops(void) { return &MC_OPS; }