Files
vatrog-vm-signaling/src/adapter/memctx/memctx.c
T
lirent 9bde398b6c vmsig: management daemon, runtime endpoint lifecycle, roster, discovery, in-tree drivers, packaging
- core: runtime attach/detach of a per-endpoint adapter trio (runtime-safe add_adapter + vmsig_core_detach_endpoint, deferred reap)
- roster: VMSIG_EV_ROSTER + CAP_ROSTER, retained per-endpoint and replayed to late subscribers
- discovery: inotify trigger dir, vmid/endpoint slot allocator, host probe; vmsigd daemon with config + per-uid admission
- input driver and vgpu perception built in-tree; vgpu perception as a separate library
- memctx: own the supplied ro_fd (closed at detach)
- deb packaging: install rules, systemd unit, tmpfiles, default config
2026-06-22 17:25:06 +03:00

416 lines
18 KiB
C

/* memctx.c — vmie sensor adapter: vends ONE coherent guest address-space context —
* the permanent System DirectoryTableBase (`kcr3`) PAIRED with a RAM-region locator
* and a pre-opened O_RDONLY fd. This is NOT perception and NOT semantics: signaling
* multicasts the datum + RO-fd, while the holder (an S-lib / any control) opens ITS OWN
* read-only vmie_mem from the fd and does gva_read/scan/pmap itself.
*
* Cold bring-up (host_bootstrap) is CPU-bound and blocking, so it runs on an off-loop
* worker; the loop thread only assembles the locator on the completion-eventfd and emits
* the MEMCTX trigger. The epoch is stamped by the CORE (retained-context); on an epoch
* change the core calls reg.invalidate, the adapter re-bootstraps and re-emits MEMCTX.
*
* RO outward is physical: O_RDONLY fd => mmap(PROT_WRITE) -> EACCES, so a write into the
* guest on the holder side is structurally impossible. Stub mode (cfg absent, cfg->stub
* set, or neither ram_path nor ro_fd supplied) synthesizes a kcr3 and, unless an ro_fd was
* supplied, a genuinely RO-mappable fd (memfd + seal) — the seam is provable without a VM.
* An armed adapter built without VMSIG_WITH_VMIE cannot bootstrap and reports ERROR. */
#define _GNU_SOURCE
#include "vmsig_adapter.h"
#include "memctx.h"
#include "adapter_util.h" /* vmsig_worker (off-loop bootstrap) */
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/epoll.h>
#ifdef VMSIG_WITH_VMIE
#include "win32.h" /* vmie_win32_open/host_bootstrap/proc_list/close */
#endif
/* memfd_create / seal — ABI fallbacks for old glibc/kernel (stub RO-fd backing). */
#ifndef MFD_CLOEXEC
#include <sys/syscall.h>
#include <linux/memfd.h>
/* Fallback for libcs that do not declare memfd_create(): issue the raw syscall and
 * narrow its long result to the int the libc wrapper would return. */
static int memfd_create(const char* name, unsigned int flags) {
    const long rc = syscall(SYS_memfd_create, name, flags);
    return (int)rc;
}
#endif
/* Fallback values for sealing constants that older libc headers do not provide. */
#ifndef MFD_ALLOW_SEALING
#define MFD_ALLOW_SEALING 0x0002U
#endif
#ifndef F_ADD_SEALS
/* Linux defines F_ADD_SEALS as F_LINUX_SPECIFIC_BASE (1024) + 9. */
#define F_ADD_SEALS (1024 + 9)
#define F_SEAL_SHRINK 0x0002
#define F_SEAL_GROW 0x0004
#endif
#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE 0x0010 /* kernel 5.1+: forbid future writable mappings */
#endif
#define MC_STUB_SIZE 0x10000u /* 64 KB of synthetic RAM image (stub) */
/* Capacity of the adapter's segment table (cur_segs); the locator built in this file
 * currently fills only one entry. */
#define MC_MAX_SEG 8
/* Last argument to vmsig_worker_new — presumably the request queue depth (TODO confirm). */
#define MC_WORKER_DEPTH 16 /* one off-loop thread: rare bootstrap + writes */
/* Job selector carried in mc_req.op and echoed back in mc_res.op. */
enum { MC_JOB_BOOTSTRAP = 0, MC_JOB_WRITE = 1 };
/* Worker request (plain-old-data; must fit in VMSIG_WORK_SLOT). A single off-loop worker
 * runs BOTH the cold bootstrap and the atomic writes, so its FIFO serializes a write
 * against the close-on-rebootstrap.
 *  - MC_JOB_BOOTSTRAP: boot_count drives the stub kcr3 (it changes on every re-bootstrap);
 *    the armed path ignores it and reads the System DTB from the guest.
 *  - MC_JOB_WRITE: mc_submit copies the SRC bytes into src[] before queueing, so the
 *    worker's gva_write reads from the request's own copy. cr3 == 0 selects the System
 *    DTB and is resolved on the worker against a->kcr3. */
typedef struct {
uint32_t op; /* MC_JOB_* */
uint32_t boot_count; /* MC_JOB_BOOTSTRAP */
/* --- MC_JOB_WRITE --- */
uint64_t cr3; /* target AS root; 0 => a->kcr3 (kernel AS), resolved on worker */
uint64_t gva; /* guest virtual address to write to */
uint32_t len; /* number of valid bytes in src[]; bounded by mc_submit before queueing */
uint32_t corr; /* copied from the command event; echoed in the ACK */
uint32_t origin; /* copied from the command event; echoed in the ACK */
uint8_t src[VMSIG_MEMWRITE_MAX]; /* SRC bytes copied by mc_submit (gva_write reads this) */
} mc_req;
/* Worker result. mc_job zero-fills it and then sets op plus the fields of the matching
 * job type; mc_on_ready demultiplexes on op. */
typedef struct {
uint32_t op; /* echoes the job type so on_ready demuxes */
int ok; /* MC_JOB_WRITE result */
uint32_t corr; /* MC_JOB_WRITE: copied from the request */
uint32_t origin; /* MC_JOB_WRITE: copied from the request */
uint64_t kcr3; /* MC_JOB_BOOTSTRAP result */
} mc_res;
/* Per-endpoint adapter state. Allocated zeroed in mc_open and released in mc_close. */
struct vmsig_adapter {
uint32_t endpoint; /* endpoint id given to mc_open; stamped on every emitted event */
int stub; /* non-zero => synthetic kcr3 and write ACKs without actuation */
/* NOTE(review): the pointer is copied from cfg, not duplicated, and is used again on every
 * re-bootstrap and share_fd call — the string must outlive the adapter. */
const char* ram_path; /* armed: RAM-backing path (NOT published outward) */
uint64_t low; /* passed to vmie_win32_open; also the published low / segment length (0 => MC_STUB_SIZE) */
int cfg_ro_fd; /* >=0 => infra-sealed RO-fd (owned by adapter, closed in mc_close); <0 => default */
vmsig_emit emit; /* copy of the emit table handed to mc_attach */
int registered; /* set once register_memctx returned 0; mc_close then unregisters */
vmsig_worker* worker; /* off-loop bootstrap + atomic writes */
uint32_t boot_count; /* incremented on each (re-)bootstrap */
#ifdef VMSIG_WITH_VMIE
vmie_win32* win; /* held RW handle across the epoch (kcr3 source + gva_write target) */
vmie_mem* mem; /* vmie_win32_mem(win); borrowed, valid until vmie_win32_close */
#endif
/* Written only by mc_bootstrap_armed on the worker; default write target when a request
 * carries cr3 == 0. The value delivered outward is cur_pod.kcr3. */
uint64_t kcr3; /* current System DTB (also published in cur_pod.kcr3) */
/* persistent locator: owned by the loop thread; worker only yields kcr3 into scratch. */
int have_ctx; /* 1 after a locator was assembled, 0 after invalidate (not read in this file) */
vmsig_memctx cur_pod; /* kcr3/low/nseg/flags (epoch stamped by the core) */
vmsig_memseg cur_segs[MC_MAX_SEG]; /* segment table handed out by describe */
uint32_t cur_nseg; /* number of valid entries in cur_segs */
int stub_fd; /* stub: memfd of synth RAM (+seal); share_fd reopens it */
};
/* fwd: MEMWRITE completion ACK (defined below mc_submit; used in mc_on_ready demux). */
static void mc_memwrite_ack(struct vmsig_adapter* a, int ok, uint32_t corr, uint32_t origin);
/* ---- stub RO-fd: memfd + deterministic contents + seal of future writes ---- */
/* Create the synthetic RAM image used in stub mode.
 *
 * size: length of the image in bytes.
 * Returns the memfd on success, -1 when the memfd cannot be created or resized.
 * Filling the image and sealing it are best-effort: a failed mmap leaves the image
 * zero-filled and a failed seal leaves the fd unsealed; neither fails the call. */
static int mc_make_stub_fd(uint32_t size) {
    const char* const tag = "vmsig_memctx";

    int memfd = memfd_create(tag, MFD_CLOEXEC | MFD_ALLOW_SEALING);
    if (memfd < 0) {
        /* retry without the sealing flag before giving up */
        memfd = memfd_create(tag, MFD_CLOEXEC);
        if (memfd < 0) {
            return -1;
        }
    }

    if (ftruncate(memfd, (off_t)size) != 0) {
        close(memfd);
        return -1;
    }

    /* The pattern (byte value == offset mod 256) is written through a temporary RW
     * mapping, which must be gone BEFORE the seals are applied. */
    void* window = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);
    if (window != MAP_FAILED) {
        uint8_t* bytes = window;
        for (uint32_t off = 0; off < size; ++off) {
            bytes[off] = (uint8_t)off;
        }
        munmap(window, size);
    }

    /* FUTURE_WRITE: even if the holder reopens the fd as O_RDWR, it gets no writable
     * mapping. Kernels before 5.1 reject that seal, so fall back to the size seals only;
     * there only the O_RDONLY fd protects. */
    const int size_seals = F_SEAL_SHRINK | F_SEAL_GROW;
    if (fcntl(memfd, F_ADD_SEALS, size_seals | F_SEAL_FUTURE_WRITE) != 0) {
        (void)fcntl(memfd, F_ADD_SEALS, size_seals);
    }
    return memfd;
}
#ifdef VMSIG_WITH_VMIE
/* armed bring-up: open RAM (RW is vmie's internal concern), host_bootstrap, extract the
 * permanent System DTB as the System process cr3 (kcr3 — the root of the guest AS). The RW
 * handle is HELD across the epoch (kcr3 source + gva_write target); ONLY the RO-fd (share_fd)
 * leaves outward — write goes through this command plane, never a writable mmap. Runs on the
 * off-loop worker; a stale handle from a prior epoch is dropped first (serialized FIFO with
 * in-flight writes).
 *
 * a:        adapter; on success a->win, a->mem and a->kcr3 are replaced.
 * out_kcr3: receives the System cr3 on success; untouched on failure.
 * Returns 0 on success, -1 on any failure. After a failure a->win and a->mem are NULL, so
 * a later MC_JOB_WRITE is refused instead of writing through a dead handle. */
static int mc_bootstrap_armed(struct vmsig_adapter* a, uint64_t* out_kcr3) {
    enum { MC_PROC_SCAN = 16 }; /* size of the process snapshot searched for "System" */

    /* drop the stale epoch handle before anything else */
    if (a->win) {
        vmie_win32_close(a->win);
        a->win = NULL;
        a->mem = NULL;
    }

    /* mc_open allows an armed adapter with an infra ro_fd but no RAM path. There is
     * nothing to open in that configuration, so fail cleanly rather than hand a NULL
     * path to vmie_win32_open (mc_reg_share_fd guards the same case). */
    if (!a->ram_path) return -1;

    vmie_win32* v = vmie_win32_open(a->ram_path, a->low);
    if (!v) return -1;
    if (host_bootstrap(v) != 0) {
        vmie_win32_close(v);
        return -1;
    }

    process procs[MC_PROC_SCAN];
    int n = proc_list(v, 0, procs, MC_PROC_SCAN);
    uint64_t kcr3 = 0;
    /* n may be negative or exceed the buffer; both bounds are enforced here */
    for (int i = 0; i < n && i < MC_PROC_SCAN; i++) {
        if (!strcmp(procs[i].name, "System")) {
            kcr3 = procs[i].cr3;
            break;
        }
    }
    if (!kcr3) {
        vmie_win32_close(v);
        return -1;
    }

    a->win = v;                 /* HOLD: RW handle lives across the epoch */
    a->mem = vmie_win32_mem(v); /* borrowed; valid until vmie_win32_close(v) */
    a->kcr3 = kcr3;
    *out_kcr3 = kcr3;
    return 0;
}
#endif
/* ---- worker job: cold bring-up OR atomic write, off-loop ----------------- *
 * Both job types run on the SAME single worker thread, so a write on the held handle
 * never races the close-on-rebootstrap (FIFO). Jobs MUST NOT touch core structures —
 * they only use a->mem/a->kcr3, which are stable between re-bootstraps on this thread. */

/* MC_JOB_WRITE: echo corr/origin and perform the guest write.
 * Returns 0 with out->ok = 1 on success, -1 with out->ok = 0 otherwise. */
static int mc_job_write(struct vmsig_adapter* self, const mc_req* in, mc_res* out) {
    out->corr = in->corr;
    out->origin = in->origin;
    if (self->stub) {
        /* stub: ack without actuation */
        out->ok = 1;
        return 0;
    }
#ifdef VMSIG_WITH_VMIE
    /* self->mem is NULL until a bootstrap has succeeded (or after one failed and cleared
     * it): that yields an ok=0 ACK (observable to the initiator), not a crash.
     * cr3 == 0 selects the kernel AS (System DTB); it is resolved here on the worker,
     * the sole owner of self->kcr3. */
    const uint64_t root = in->cr3 != 0 ? in->cr3 : self->kcr3;
    int written = 0;
    if (self->mem != NULL) {
        written = gva_write(self->mem, (uintptr_t)root, (uintptr_t)in->gva,
                            in->src, in->len) == 0;
    }
    out->ok = written;
    return written ? 0 : -1;
#else
    /* armed without the build flag: write impossible */
    out->ok = 0;
    return -1;
#endif
}

/* MC_JOB_BOOTSTRAP: produce the kcr3 for this epoch. Returns 0 on success, -1 on failure. */
static int mc_job_bootstrap(struct vmsig_adapter* self, const mc_req* in, mc_res* out) {
    if (self->stub) {
        /* synthetic root that changes per epoch */
        out->kcr3 = 0xC0DE0000ull + (uint64_t)in->boot_count * 0x1000ull;
        return 0;
    }
#ifdef VMSIG_WITH_VMIE
    uint64_t root = 0;
    if (mc_bootstrap_armed(self, &root) != 0) {
        return -1;
    }
    out->kcr3 = root;
    return 0;
#else
    (void)out;
    return -1; /* armed without the build flag: bootstrap impossible -> ERROR */
#endif
}

/* Worker entry point: clears the result, echoes the job type, then dispatches on it.
 * Any op other than MC_JOB_WRITE is treated as a bootstrap. */
static int mc_job(void* user, const void* req, void* res) {
    struct vmsig_adapter* self = user;
    const mc_req* in = req;
    mc_res* out = res;

    memset(out, 0, sizeof *out);
    out->op = in->op;

    return in->op == MC_JOB_WRITE ? mc_job_write(self, in, out)
                                  : mc_job_bootstrap(self, in, out);
}
/* Queue one cold bootstrap on the worker. The bootstrap counter is advanced first and
 * travels with the request (it seeds the stub kcr3). */
static void mc_kick_bootstrap(struct vmsig_adapter* a) {
    const uint32_t generation = ++a->boot_count;

    mc_req job;
    memset(&job, 0, sizeof job);
    job.boot_count = generation;
    job.op = MC_JOB_BOOTSTRAP;

    /* a full queue drops the request (rare); the result is deliberately ignored */
    (void)vmsig_worker_submit(a->worker, &job, sizeof job);
}
/* ---- reg hooks (vmsig_memctx_reg.ctx = a; called by the core on the loop thread) ---- */
/* describe hook: hand the core the current locator.
 * out_pod receives a copy of kcr3/low/nseg/flags (the core overwrites the epoch);
 * out_segs/out_nseg receive a pointer into the adapter's own segment table. */
static void mc_reg_describe(void* ctx, vmsig_memctx* out_pod,
                            const vmsig_memseg** out_segs, uint32_t* out_nseg) {
    const struct vmsig_adapter* self = ctx;

    *out_nseg = self->cur_nseg;
    *out_segs = self->cur_segs;
    *out_pod = self->cur_pod;
}
static int mc_reg_share_fd(void* ctx) {
struct vmsig_adapter* a = ctx;
if (a->cfg_ro_fd >= 0)
return fcntl(a->cfg_ro_fd, F_DUPFD_CLOEXEC, 0); /* infra-sealed RO-fd: dup */
if (a->stub) {
if (a->stub_fd < 0) return -1;
char path[64];
snprintf(path, sizeof path, "/proc/self/fd/%d", a->stub_fd);
return open(path, O_RDONLY | O_CLOEXEC); /* fresh O_RDONLY on the backing */
}
if (!a->ram_path) return -1;
return open(a->ram_path, O_RDONLY | O_CLOEXEC); /* armed default */
}
/* invalidate hook: the core owns the epoch value, so it is ignored here. The adapter
 * marks its context invalid and queues a re-bootstrap; mc_on_ready re-emits MEMCTX
 * when that completes. */
static void mc_reg_invalidate(void* ctx, uint32_t epoch) {
    struct vmsig_adapter* self = ctx;

    (void)epoch;
    self->have_ctx = 0;      /* the previous context is no longer valid */
    mc_kick_bootstrap(self); /* runs off-loop */
}
/* ---- vtable ---- */
/* Allocate an adapter for `endpoint` from an optional vmsig_memctx_cfg.
 * A NULL cfg selects stub mode with no path, low = 0 and no ro_fd. Stub mode is also
 * forced when neither a RAM path nor an ro_fd is configured. A negative cfg ro_fd is
 * normalised to -1. Returns NULL when the allocation fails. */
static vmsig_adapter* mc_open(const void* cfg, uint32_t endpoint) {
    struct vmsig_adapter* self = calloc(1, sizeof *self);
    if (self == NULL) {
        return NULL;
    }

    const vmsig_memctx_cfg* conf = cfg;
    self->endpoint = endpoint;
    self->stub_fd = -1;

    if (conf == NULL) {
        self->stub = 1;
        self->ram_path = NULL;
        self->low = 0;
        self->cfg_ro_fd = -1;
    } else {
        self->stub = conf->stub;
        self->ram_path = conf->ram_path;
        self->low = conf->low;
        self->cfg_ro_fd = conf->ro_fd >= 0 ? conf->ro_fd : -1;
    }

    /* no path/fd => stub */
    if (self->ram_path == NULL && self->cfg_ro_fd < 0) {
        self->stub = 1;
    }
    return self;
}
static int mc_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg, int cap) {
if (cap < 1) return -1;
a->emit = *emit;
a->worker = vmsig_worker_new(mc_job, a, 1, MC_WORKER_DEPTH);
if (!a->worker) return -1;
if (a->stub && a->cfg_ro_fd < 0) {
a->stub_fd = mc_make_stub_fd(MC_STUB_SIZE);
if (a->stub_fd < 0) { vmsig_worker_free(a->worker); a->worker = NULL; return -1; }
}
/* worker completion-eventfd as the readiness source (cookie=0). */
reg[0].fd = vmsig_worker_evfd(a->worker);
reg[0].epoll_events = EPOLLIN;
reg[0].shape = VMSIG_RDY_EVENTFD;
reg[0].cookie = 0;
/* register the reg BEFORE the first bootstrap: the core slot gets the hooks. describe
* is not called until the slot is valid (which only happens after the first MEMCTX). */
if (a->emit.register_memctx) {
vmsig_memctx_reg r;
memset(&r, 0, sizeof r);
r.endpoint = a->endpoint;
r.source = VMSIG_SRC_MEMCTX;
r.ctx = a;
r.describe = mc_reg_describe;
r.share_fd = mc_reg_share_fd;
r.invalidate = mc_reg_invalidate;
if (a->emit.register_memctx(a->emit.token, &r) == 0) a->registered = 1;
}
vmsig_event up;
memset(&up, 0, sizeof up);
up.kind = VMSIG_EV_SEAM_UP; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP;
up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint;
a->emit.emit(a->emit.token, &up);
mc_kick_bootstrap(a); /* first bootstrap off-loop; assemble the locator on completion */
return 1;
}
static int mc_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) {
(void)cookie; (void)events;
vmsig_worker_ack(a->worker);
mc_res rs;
int rc;
while (vmsig_worker_poll(a->worker, &rs, sizeof rs, &rc) == 1) {
if (rs.op == MC_JOB_WRITE) {
/* atomic write completed: addressed ACT_ACK to the initiator. */
mc_memwrite_ack(a, rs.ok && rc == 0, rs.corr, rs.origin);
continue;
}
if (rc != 0) {
/* bootstrap failed: ERROR (source MEMCTX); do NOT publish an invalid kcr3. */
vmsig_event er;
memset(&er, 0, sizeof er);
er.kind = VMSIG_EV_ERROR; er.source = VMSIG_SRC_MEMCTX; er.dir = VMSIG_DIR_UP;
er.prio = VMSIG_PRIO_URGENT; er.endpoint = a->endpoint;
a->emit.emit(a->emit.token, &er);
continue;
}
/* assemble the locator on the loop thread from rs.kcr3. a->kcr3 is the gva_write
* TARGET and is owned SOLELY by the worker thread (set in mc_bootstrap_armed, read by
* MC_JOB_WRITE — same thread, FIFO happens-before); the loop must NOT also write it, or
* an in-flight write at line ~170 would race it. cur_pod.kcr3 is loop-only (delivery). */
memset(&a->cur_pod, 0, sizeof a->cur_pod);
a->cur_pod.kcr3 = rs.kcr3;
a->cur_pod.low = a->low ? a->low : MC_STUB_SIZE;
a->cur_pod.flags = VMSIG_MEMCTX_RDONLY;
a->cur_nseg = 1; /* single-low identity (gpa 0 .. low) */
a->cur_segs[0].gpa = 0;
a->cur_segs[0].len = a->cur_pod.low;
a->cur_segs[0].file_off = 0;
a->cur_pod.nseg = a->cur_nseg;
a->have_ctx = 1;
/* emit the MEMCTX trigger: the core authoritatively re-describes + stamps the epoch. */
vmsig_event up;
memset(&up, 0, sizeof up);
up.kind = VMSIG_EV_MEMCTX; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP;
up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint;
memcpy(up.inln, &a->cur_pod, sizeof a->cur_pod);
a->emit.emit(a->emit.token, &up);
}
return 0;
}
/* Send the addressed ACT_ACK for a MEMWRITE (source MEMCTX, back to the initiator).
 * The inline area carries {ok, corr, origin} — the same shape as the input adapter's
 * ACK — so control finds ok at offset 0. ok=0 stands for every refusal: extent-deny,
 * missing SRC, queue-full or a failed write (default-deny, observable). */
static void mc_memwrite_ack(struct vmsig_adapter* a, int ok, uint32_t corr, uint32_t origin) {
    vmsig_event ack;
    memset(&ack, 0, sizeof ack);
    ack.endpoint = a->endpoint;
    ack.source = VMSIG_SRC_MEMCTX;
    ack.kind = VMSIG_EV_ACT_ACK;
    ack.dir = VMSIG_DIR_UP;
    ack.prio = VMSIG_PRIO_NORMAL;
    ack.origin = origin;
    ack.corr = corr;
    ack.payload.flags = VMSIG_PL_INLINE;

    struct { int ok; uint32_t corr; uint32_t origin; } wire = { ok, corr, origin };
    memcpy(ack.inln, &wire, sizeof wire);

    a->emit.emit(a->emit.token, &ack);
}
/* DOWN MEMWRITE handler: validate the extent, copy the SRC bytes into a worker request and
 * queue the atomic gva_write. Default-deny: any invalid path (length out of bounds, no SRC
 * flag, inline SRC longer than the inline area, missing or short payload, queue full) ACKs
 * ok=0 and does NOT actuate. The completion ACK for a queued write arrives via mc_on_ready.
 *
 * Returns  1 when the event is not a MEMWRITE (not for this seam),
 *          0 when the event was consumed (queued, or denied with an ok=0 ACK),
 *         -1 when the worker queue was full (an ok=0 ACK has already been emitted). */
static int mc_submit(vmsig_adapter* a, const vmsig_event* ev) {
    if (ev->kind != VMSIG_EV_CMD_MEMWRITE) return 1; /* not for this seam */

    /* Copy the header out of the inline area instead of reading it through a cast
     * pointer: the inline area is filled with memcpy by producers, and a struct access
     * through a cast would rely on its alignment and violate the aliasing rules. */
    vmsig_memwrite hdr;
    memcpy(&hdr, ev->inln, sizeof hdr);

    const uint32_t len = hdr.len;
    if (len == 0 || len > VMSIG_MEMWRITE_MAX) { /* extent: bounded */
        mc_memwrite_ack(a, 0, ev->corr, ev->origin);
        return 0;
    }

    /* Locate the SRC bytes; src stays NULL on every path that must be denied. When both
     * flags are set, the inline source takes precedence. */
    const void* src = NULL;
    if (hdr.flags & VMSIG_MW_SRC_INLINE) {
        if (len <= VMSIG_MEMWRITE_INLINE)
            src = ev->inln + sizeof hdr; /* inline tail right after the header */
    } else if (hdr.flags & VMSIG_MW_SRC_PAYLOAD) {
        if (ev->payload.data && ev->payload.len >= len)
            src = ev->payload.data; /* in-proc borrowed payload */
    }
    if (!src) {
        mc_memwrite_ack(a, 0, ev->corr, ev->origin);
        return 0;
    }

    mc_req rq;
    memset(&rq, 0, sizeof rq);
    rq.op = MC_JOB_WRITE;
    rq.cr3 = hdr.cr3;
    rq.gva = hdr.gva;
    rq.len = len;
    rq.corr = ev->corr;
    rq.origin = ev->origin;
    memcpy(rq.src, src, len); /* the worker's gva_write reads from this copy */

    if (vmsig_worker_submit(a->worker, &rq, sizeof rq) != 0) {
        mc_memwrite_ack(a, 0, ev->corr, ev->origin); /* queue full -> ACK err */
        return -1;
    }
    return 0; /* completion ACK arrives via mc_on_ready */
}
/* Tear the adapter down; a NULL adapter is a no-op. The order matters: unregister the
 * hooks first, then free the worker (which joins it, so bootstrap and write jobs are
 * finished), and only then close the held vmie handle — no gva_write can be in flight
 * any more. Finally the descriptors and the adapter itself are released. */
static void mc_close(vmsig_adapter* a) {
    if (a == NULL) {
        return;
    }
    struct vmsig_adapter* self = a;

    if (self->registered && self->emit.unregister_memctx) {
        self->emit.unregister_memctx(self->emit.token, self->endpoint);
    }
    if (self->worker != NULL) {
        vmsig_worker_free(self->worker);
    }
#ifdef VMSIG_WITH_VMIE
    if (self->win != NULL) {
        vmie_win32_close(self->win);
    }
#endif

    /* Both descriptors belong to the adapter. Ownership of the configured ro_fd was
     * transferred at open(), so closing it here keeps a re-grant (detach + re-attach with
     * a fresh infra ro_fd) from leaking the prior one. Infra that must keep its own copy
     * dups before handing it in — symmetric to the holder side, which dups the borrowed
     * RO-fd it receives. */
    const int owned_fds[] = { self->stub_fd, self->cfg_ro_fd };
    for (size_t i = 0; i < sizeof owned_fds / sizeof owned_fds[0]; i++) {
        if (owned_fds[i] >= 0) {
            close(owned_fds[i]);
        }
    }
    free(self);
}
/* Adapter vtable for the memctx seam. */
static const vmsig_adapter_ops MC_OPS = {
    .name         = "memctx",
    .source       = VMSIG_SRC_MEMCTX,
    .codec        = VMSIG_CODEC_MEMCTX,
    .open         = mc_open,
    .attach       = mc_attach,
    .on_readiness = mc_on_ready,
    .submit       = mc_submit,
    .close        = mc_close,
};
/* Public accessor: returns the address of the file-local memctx vtable. */
const vmsig_adapter_ops* vmsig_memctx_ops(void) {
    const vmsig_adapter_ops* const table = &MC_OPS;
    return table;
}