mirror of
https://dev.lirent.ru/Vatrog/vm-automation-signaling.git
synced 2026-06-20 19:06:37 +03:00
vmsig: a neutral signaling layer between sensors/input and controls
An epoll-driven, neutral transfer-event bus that connects sensors and input actuators to one or more controls, bidirectionally. It owns the transfer context and events — delivery order, priority, protocol-level timing, and an interrupt-driven event model over fd sources (eventfd/timerfd/sockets) — and stays agnostic to both the sensor/input drivers and the control. What lives here: - memctx: a coherent address-space context per endpoint — the guest address-space root paired with a pre-opened read-only RAM-region fd, with per-endpoint epoch invalidation and retained replay to late subscribers. Perception lives in out-of-tree sensor libraries that consume this datum read-only. - exclusive-ownership leases for destructive resource classes (input, power, memory-write). - write-signaled memory writes (MEMWRITE): an atomic write to guest memory routed through the seam under an exclusive lease, never a writable mapping. - a host-management seam for VM lifecycle/status and a neutral input-injection command path. - multi-VM endpoints; capability-gated, audited control authorization over an in-process or unix-socket transport. Builds against headers only by default (a stub mode that exercises the seam without a VM); armed builds link the real sensor/input libraries behind flags. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+224
@@ -0,0 +1,224 @@
|
||||
/* core.c — core lifecycle and registration of adapters/controls.
|
||||
* The loop and pumps live in loop.c. */
|
||||
#include "core_internal.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <sys/epoll.h>
|
||||
#include <sys/eventfd.h>
|
||||
|
||||
/* Register `fd` with the core's epoll instance and bind it to a slot.
 *
 * A slot currently marked SLOT_DEAD is recycled when one exists, so c->slots[]
 * does not grow on every connection; otherwise a new slot is allocated and
 * appended (the table doubles when full, starting at 16 entries). The slot is
 * zeroed, stamped with `role` and `fd`, and its address is stored in
 * epoll_event.data.ptr.
 *
 * Returns the slot, or NULL when an allocation or EPOLL_CTL_ADD fails. After an
 * epoll failure the slot stays in c->slots[] marked SLOT_DEAD. */
core_slot* core_register_fd(vmsig_core* c, int fd, uint32_t epoll_events, slot_role role) {
    core_slot* slot = NULL;
    int idx = 0;

    /* first choice: the first detached slot in table order */
    while (idx < c->nslots && c->slots[idx]->role != SLOT_DEAD)
        idx++;

    if (idx < c->nslots) {
        slot = c->slots[idx];
    } else {
        if (c->nslots == c->cap_slots) {
            int grown = (c->cap_slots != 0) ? c->cap_slots * 2 : 16;
            core_slot** table = realloc(c->slots, (size_t)grown * sizeof *table);
            if (table == NULL)
                return NULL;
            c->slots = table;
            c->cap_slots = grown;
        }
        slot = calloc(1, sizeof *slot);
        if (slot == NULL)
            return NULL;
        c->slots[c->nslots] = slot;
        c->nslots++;
    }

    /* a recycled slot still carries its previous owner's fields */
    memset(slot, 0, sizeof *slot);
    slot->role = role;
    slot->fd = fd;

    struct epoll_event ev;
    memset(&ev, 0, sizeof ev);
    ev.data.ptr = slot;
    ev.events = epoll_events;

    if (epoll_ctl(c->epfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
        slot->role = SLOT_DEAD;
        return NULL;
    }
    return slot;
}
|
||||
|
||||
/* Release everything a partially constructed core owns: the slot records, the
 * slot table, both fds and the core itself. Used only on vmsig_core_new failure
 * paths. Timing fds are not closed here. */
static void core_new_abort(vmsig_core* c) {
    for (int i = 0; i < c->nslots; i++) free(c->slots[i]);
    free(c->slots);
    if (c->wake_fd >= 0) close(c->wake_fd);
    if (c->epfd >= 0) close(c->epfd);
    free(c);
}

/* Create a core bound to `ctx`.
 *
 * Creates the epoll instance and the wake eventfd, registers the wake fd, then
 * registers each context pacing timerfd reported by vmsig_ctx_timing_fd() for
 * the UP and DOWN directions (a negative fd means "none" and is skipped).
 *
 * Returns the new core, or NULL if ctx is NULL or any step fails. On failure
 * everything allocated so far is released, including slots that
 * core_register_fd() allocated before its epoll_ctl failed. */
vmsig_core* vmsig_core_new(vmsig_ctx* ctx) {
    if (!ctx) return NULL;
    vmsig_core* c = calloc(1, sizeof *c);
    if (!c) return NULL;
    c->ctx = ctx;
    c->epfd = -1;
    c->wake_fd = -1;

    c->epfd = epoll_create1(EPOLL_CLOEXEC);
    if (c->epfd < 0) goto fail;

    c->wake_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
    if (c->wake_fd < 0) goto fail;
    if (!core_register_fd(c, c->wake_fd, EPOLLIN, SLOT_WAKEUP)) goto fail;

    /* context pacing timerfds (created in ctx_new) as loop sources */
    for (int d = VMSIG_DIR_UP; d <= VMSIG_DIR_DOWN; d++) {
        int tfd = vmsig_ctx_timing_fd(ctx, (vmsig_dir)d);
        if (tfd < 0) continue;
        /* A timer that never reaches epoll would silently stop pacing, so a failed
         * registration is fatal. EEXIST (fd already in the epoll set, e.g. if both
         * directions were to report the same timerfd) is tolerated. */
        errno = 0;
        if (!core_register_fd(c, tfd, EPOLLIN, SLOT_CTX_TIMING) && errno != EEXIST)
            goto fail;
    }
    return c;

fail:
    core_new_abort(c);
    return NULL;
}
|
||||
|
||||
/* Open an adapter, attach it to the core and register its fds with the loop.
 *
 * ops->open(cfg, endpoint) creates the adapter; ops->attach() receives the emit
 * hooks (UP emission + memctx register/unregister, token = core) and fills up to
 * VMSIG_ADAPTER_FDS fd registrations. Each fd is registered as SLOT_ADAPTER; an
 * entry whose epoll_events is 0 defaults to EPOLLIN.
 *
 * Returns the adapter id (its index in c->adapters[]) or -1 on failure. On
 * failure every fd registered so far is removed from epoll and its slot is
 * marked SLOT_DEAD before the adapter is closed, so no live slot is left
 * pointing at a closed adapter. */
int vmsig_core_add_adapter(vmsig_core* c, const vmsig_adapter_ops* ops,
                           const void* cfg, uint32_t endpoint) {
    if (!c || !ops || !ops->open || !ops->attach) return -1;
    if (c->nadapters >= VMSIG_MAX_ADAPTERS) return -1;

    vmsig_adapter* a = ops->open(cfg, endpoint);
    if (!a) return -1;

    vmsig_emit emit = { core_emit_up, core_register_memctx, core_unregister_memctx, c };
    vmsig_fd_reg reg[VMSIG_ADAPTER_FDS];
    memset(reg, 0, sizeof reg);

    core_slot* added[VMSIG_ADAPTER_FDS];   /* slots registered so far (for rollback) */
    int nadded = 0;

    int n = ops->attach(a, &emit, reg, VMSIG_ADAPTER_FDS);
    /* a count above the capacity we passed would read past reg[] */
    if (n < 0 || n > VMSIG_ADAPTER_FDS) goto fail;

    for (int i = 0; i < n; i++) {
        uint32_t events = reg[i].epoll_events ? reg[i].epoll_events : (uint32_t)EPOLLIN;
        core_slot* s = core_register_fd(c, reg[i].fd, events, SLOT_ADAPTER);
        if (!s) goto fail;
        s->ops = ops;
        s->adapter = a;
        s->cookie = reg[i].cookie;
        added[nadded++] = s;
    }

    int id = c->nadapters;
    c->adapters[id].ops = ops;
    c->adapters[id].a = a;
    c->adapters[id].endpoint = endpoint;
    c->nadapters++;
    return id;

fail:
    /* detach from epoll BEFORE close: the fds are presumably still open here */
    for (int i = 0; i < nadded; i++) {
        (void)epoll_ctl(c->epfd, EPOLL_CTL_DEL, added[i]->fd, NULL);
        added[i]->role = SLOT_DEAD;
    }
    if (ops->close) ops->close(a);   /* close is optional (vmsig_core_free checks it too) */
    return -1;
}
|
||||
|
||||
int vmsig_core_add_control(vmsig_core* c, const vmsig_control_ops* ops, void* ctl,
|
||||
const vmsig_grant* grant) {
|
||||
if (!c || !ops) return -1;
|
||||
|
||||
/* reuse a freed (reaped) slot; otherwise grow up to the ceiling */
|
||||
int id = -1;
|
||||
for (int i = 0; i < c->ncontrols; i++)
|
||||
if (!c->controls[i].active) { id = i; break; }
|
||||
if (id < 0) {
|
||||
if (c->ncontrols >= VMSIG_MAX_CONTROLS) return -1;
|
||||
id = c->ncontrols++;
|
||||
}
|
||||
core_control_ent* e = &c->controls[id];
|
||||
uint16_t gen = e->gen; /* generation survives the slot memset */
|
||||
memset(e, 0, sizeof *e);
|
||||
e->gen = (uint16_t)(gen + 1); /* new generation for this (re)use */
|
||||
e->ops = ops;
|
||||
e->ctl = ctl;
|
||||
e->active = 1;
|
||||
if (grant) e->grant = *grant; /* otherwise stays zero => default-deny */
|
||||
e->dctx.core = c;
|
||||
e->dctx.ctl_id = id;
|
||||
|
||||
if (ops->subscribe) ops->subscribe(ctl, &e->sub);
|
||||
/* emit_down token is our down_ctx, so emit_down can find this control's grant */
|
||||
if (ops->set_emit_down) ops->set_emit_down(ctl, core_emit_down, &e->dctx);
|
||||
|
||||
int fd = ops->fd ? ops->fd(ctl) : -1;
|
||||
if (fd >= 0) {
|
||||
core_slot* s = core_register_fd(c, fd, EPOLLIN, SLOT_CONTROL);
|
||||
if (!s) return -1;
|
||||
s->cops = ops;
|
||||
s->ctl = ctl;
|
||||
e->slot = s;
|
||||
}
|
||||
|
||||
/* Late subscriber: replay retained MEMCTX (if a context is already published and
|
||||
* this control is qualified). For a control added BEFORE the first publication,
|
||||
* the cell is not yet valid — it receives MEMCTX via the normal multicast in pump_up. */
|
||||
core_memctx_replay(c, id);
|
||||
|
||||
return id; /* ncontrols already bumped when picking id (on growth); reuse does not grow it */
|
||||
}
|
||||
|
||||
|
||||
/* ===== MEMCTX registration: per-endpoint retain cell (called by the adapter on the loop thread) =====
|
||||
* Registers the address-space context adapter's reg hooks. The core holds THIS and does
|
||||
* NOT store a copy of the locator: on delivery/replay it calls reg.describe/share_fd.
|
||||
* valid/epoch are maintained in route/epoch_bump (not here): register only records that
|
||||
* "the adapter is connected". */
|
||||
int core_register_memctx(void* token, const vmsig_memctx_reg* reg) {
|
||||
vmsig_core* c = token;
|
||||
if (!c || !reg || reg->endpoint >= 64) return -1;
|
||||
core_memctx_cell* cell = &c->memctx[reg->endpoint];
|
||||
cell->reg = *reg;
|
||||
cell->registered = 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void core_unregister_memctx(void* token, uint32_t endpoint) {
|
||||
vmsig_core* c = token;
|
||||
if (!c || endpoint >= 64) return;
|
||||
core_memctx_cell* cell = &c->memctx[endpoint];
|
||||
cell->registered = 0;
|
||||
cell->valid = 0;
|
||||
memset(&cell->reg, 0, sizeof cell->reg);
|
||||
}
|
||||
|
||||
/* Install (or, with cb == NULL, remove) the audit callback and its user pointer.
 * A NULL core is ignored. */
void vmsig_core_set_audit(vmsig_core* c, void (*cb)(void* ud, const vmsig_audit* a), void* ud) {
    if (c != NULL) {
        c->audit_ud = ud;
        c->audit_cb = cb;
    }
}
|
||||
|
||||
/* Forward an audit record to the installed callback; does nothing when the core
 * is NULL or no callback is set. */
void core_audit(vmsig_core* c, const vmsig_audit* a) {
    if (c == NULL || c->audit_cb == NULL)
        return;
    c->audit_cb(c->audit_ud, a);
}
|
||||
|
||||
/* Install the lease arbitration policy callback and its user pointer. A NULL core
 * is ignored. The lease table itself is not touched: it was zeroed by the calloc
 * in vmsig_core_new, i.e. all cells start free. */
void vmsig_core_set_arb_policy(vmsig_core* c, vmsig_arb_policy cb, void* ud) {
    if (c != NULL) {
        c->arb_ud = ud;
        c->arb_cb = cb;
    }
}
|
||||
|
||||
/* Register an arbitrary fd as a SLOT_SOURCE loop source (EPOLLIN).
 * `cb` and `user` are stored as the slot's readiness callback and its argument;
 * `on_free` (may be NULL) is stored for vmsig_core_free.
 * Returns 0, or -1 on a NULL core, negative fd, NULL cb, or failed registration. */
int core_add_source(vmsig_core* c, int fd, void (*cb)(void* user, uint32_t events),
                    void* user, void (*on_free)(void* user)) {
    if (c == NULL || cb == NULL || fd < 0)
        return -1;

    core_slot* src = core_register_fd(c, fd, EPOLLIN, SLOT_SOURCE);
    if (src == NULL)
        return -1;

    src->source_user = user;
    src->on_source = cb;
    src->on_free = on_free;
    return 0;
}
|
||||
|
||||
/* Flag control `ctl_id` for a deferred reap and nudge the loop so a reap pass
 * runs (without stopping it). Out-of-range ids and a NULL core are ignored. */
void core_request_drop(vmsig_core* c, int ctl_id) {
    if (c == NULL)
        return;
    if (ctl_id < 0 || ctl_id >= c->ncontrols)
        return;

    core_control_ent* victim = &c->controls[ctl_id];
    victim->reap = 1;
    core_wake(c);
}
|
||||
|
||||
/* Tear the core down. Order:
 *   1. close every adapter that has a close hook — adapters go FIRST so their
 *      close can stop off-loop workers and unregister their seams (e.g. memctx)
 *      before destruction;
 *   2. close every still-active control that has a close hook;
 *   3. run on_free for every SLOT_SOURCE slot that has one (e.g. a unix listener
 *      closing its listen/janitor fd);
 *   4. free all slot records and the slot table, close wake_fd and epfd, free c.
 * The ctx is not owned by the core and is left to its owner. NULL is ignored. */
void vmsig_core_free(vmsig_core* c) {
    if (c == NULL)
        return;

    for (int ai = 0; ai < c->nadapters; ai++) {
        core_adapter_ent* ad = &c->adapters[ai];
        if (ad->ops->close)
            ad->ops->close(ad->a);
    }

    for (int ci = 0; ci < c->ncontrols; ci++) {
        core_control_ent* ctl = &c->controls[ci];
        if (!ctl->active)
            continue;
        if (ctl->ops->close)
            ctl->ops->close(ctl->ctl);
    }

    /* all source cleanups run before any slot record is released */
    for (int si = 0; si < c->nslots; si++) {
        core_slot* sl = c->slots[si];
        if (sl->role != SLOT_SOURCE || !sl->on_free)
            continue;
        sl->on_free(sl->source_user);
    }

    for (int si = 0; si < c->nslots; si++)
        free(c->slots[si]);
    free(c->slots);

    if (c->wake_fd >= 0)
        close(c->wake_fd);
    if (c->epfd >= 0)
        close(c->epfd);

    free(c);
}
|
||||
@@ -0,0 +1,170 @@
|
||||
#ifndef VMSIG_CORE_INTERNAL_H
|
||||
#define VMSIG_CORE_INTERNAL_H
|
||||
#include "vmsig_core.h"
|
||||
#include <signal.h>
|
||||
|
||||
/* Private internals of the epoll core. Each registered fd carries a
|
||||
* core_slot* in epoll_event.data.ptr; the slot's role decides how to handle it. */
|
||||
|
||||
#define VMSIG_MAX_EVENTS 64
|
||||
#define VMSIG_MAX_ADAPTERS 256 /* up to ~64 VMs * 3 adapters + slack (mode A) */
|
||||
#define VMSIG_MAX_CONTROLS 64 /* concurrent pollers; more => processes (C) */
|
||||
#define VMSIG_ADAPTER_FDS 8 /* max fds per adapter */
|
||||
#define VMSIG_DOWN_PENDING_MAX 256 /* ceiling of DOWN commands per poller in ctx (fairness) */
|
||||
|
||||
/* What a registered fd is; the loop dispatches on this. */
typedef enum {
    SLOT_WAKEUP     = 0,  /* wake/stop eventfd */
    SLOT_ADAPTER    = 1,  /* adapter fd (timerfd/eventfd/socket) */
    SLOT_CTX_TIMING = 2,  /* context pacing timerfd */
    SLOT_CONTROL    = 3,  /* out-of-process control socket */
    SLOT_SOURCE     = 4,  /* arbitrary fd + callback (e.g. listen-fd) */
    SLOT_DEAD       = 5   /* detached (reaped); loop ignores it */
} slot_role;
|
||||
|
||||
typedef struct core_slot {
|
||||
slot_role role;
|
||||
int fd;
|
||||
/* for SLOT_ADAPTER */
|
||||
const vmsig_adapter_ops* ops;
|
||||
vmsig_adapter* adapter;
|
||||
uint32_t cookie;
|
||||
/* for SLOT_CONTROL */
|
||||
const vmsig_control_ops* cops;
|
||||
void* ctl;
|
||||
/* for SLOT_SOURCE */
|
||||
void (*on_source)(void* user, uint32_t events);
|
||||
void (*on_free)(void* user); /* invoked at core_free (source cleanup) */
|
||||
void* source_user;
|
||||
} core_slot;
|
||||
|
||||
typedef struct {
|
||||
const vmsig_adapter_ops* ops;
|
||||
vmsig_adapter* a;
|
||||
uint32_t endpoint;
|
||||
} core_adapter_ent;
|
||||
|
||||
|
||||
/* ===== Retained address-space context (MEMCTX seam) =====
|
||||
* The core retains per-endpoint "a current context exists in the current epoch" + the
|
||||
* adapter's reg pointer (describe/share_fd/invalidate). Replays to a late qualified
|
||||
* subscriber (CAP_MEMCTX + source_mask + endpoint) re-sharing the RO-fd. Does NOT store a
|
||||
* copy of the locator: on delivery/replay it calls reg.describe (adapter snapshot) +
|
||||
* reg.share_fd (fresh RO-fd). Invalidated on epoch change; cleared on unregister/free. */
|
||||
typedef struct {
|
||||
int registered; /* adapter called register_memctx (reg valid) */
|
||||
int valid; /* a published context exists in the current epoch */
|
||||
uint32_t epoch; /* snapshot epoch (== core epoch[ep] when valid) */
|
||||
vmsig_memctx_reg reg; /* valid when registered */
|
||||
} core_memctx_cell;
|
||||
|
||||
/* ===== Lease layer (arbitration of exclusive ownership of destructive resources) =====
|
||||
* One cell per (endpoint, lease-class): who owns it (origin) + a snapshot of arb_prio at
|
||||
* acquisition time. owner=0 => free. The snapshot (rather than the live grant) makes the
|
||||
* policy resilient to the owner's grant changing after acquisition. */
|
||||
#define VMSIG_LEASE_CLASSES 3 /* INPUT, POWER, MEMWRITE (== VMSIG_LEASE_CLASS_MAX) */
|
||||
typedef struct {
    uint32_t owner;       /* owner's origin, (gen<<16)|(id+1); 0 means the cell is free */
    uint32_t owner_prio;  /* snapshot of the owner's arb_prio taken at acquisition */
} core_lease_cell;
|
||||
|
||||
struct vmsig_core; /* fwd for core_down_ctx */
|
||||
|
||||
/* DOWN emission context. A pointer to this is the token given to a control in
 * set_emit_down, so emit_down can tell WHICH control issued a command (grant
 * lookup and enforcement). The address is stable because the object lives inside
 * the fixed controls[] array. */
typedef struct {
    struct vmsig_core* core;    /* owning core */
    int                ctl_id;  /* index of the control in controls[] */
} core_down_ctx;
|
||||
|
||||
typedef struct {
|
||||
const vmsig_control_ops* ops;
|
||||
void* ctl;
|
||||
vmsig_sub sub;
|
||||
vmsig_grant grant; /* poller's rights ceiling (default-deny) */
|
||||
core_down_ctx dctx; /* token for emit_down */
|
||||
int active; /* 0 = detached/reaped (slot free) */
|
||||
int reap; /* reap requested (deferred) */
|
||||
core_slot* slot; /* SLOT_CONTROL fd slot (or NULL) */
|
||||
uint32_t pending; /* DOWN commands of this poller in ctx (fairness cap) */
|
||||
uint16_t gen; /* slot generation: +1 on each (re)use */
|
||||
} core_control_ent;
|
||||
|
||||
struct vmsig_core {
|
||||
int epfd;
|
||||
int wake_fd; /* eventfd: nudge + stop */
|
||||
vmsig_ctx* ctx;
|
||||
volatile sig_atomic_t stopping;
|
||||
|
||||
core_adapter_ent adapters[VMSIG_MAX_ADAPTERS];
|
||||
int nadapters;
|
||||
core_control_ent controls[VMSIG_MAX_CONTROLS];
|
||||
int ncontrols;
|
||||
|
||||
core_slot** slots; /* all allocated slots (for free) */
|
||||
int nslots;
|
||||
int cap_slots;
|
||||
|
||||
|
||||
uint32_t epoch[64]; /* per-endpoint VM session epoch */
|
||||
core_memctx_cell memctx[64]; /* per-endpoint retained context */
|
||||
|
||||
core_lease_cell lease[64][VMSIG_LEASE_CLASSES]; /* lease per (endpoint, class) */
|
||||
vmsig_arb_policy arb_cb; /* preemption policy (NULL=default) */
|
||||
void* arb_ud;
|
||||
|
||||
void (*audit_cb)(void* ud, const vmsig_audit* a);
|
||||
void* audit_ud;
|
||||
};
|
||||
|
||||
/* Emit an audit record (no-op if no callback is set). Defined in core.c. */
|
||||
void core_audit(vmsig_core* c, const vmsig_audit* a);
|
||||
|
||||
/* Register an fd in epoll + create a slot (see core.c). */
|
||||
core_slot* core_register_fd(vmsig_core* c, int fd, uint32_t epoll_events, slot_role role);
|
||||
|
||||
/* Register an arbitrary fd source with a callback (e.g. a socket listen-fd).
|
||||
* The callback is called on the loop thread when the fd is ready. on_free (may be NULL)
|
||||
* is called at vmsig_core_free to clean up the source's resource. 0/-1. */
|
||||
int core_add_source(vmsig_core* c, int fd, void (*cb)(void* user, uint32_t events),
|
||||
void* user, void (*on_free)(void* user));
|
||||
|
||||
/* Request detaching a control by id (deferred reap after the batch: epoll DEL,
|
||||
* close fd, ops->close). Safe to call from the control's own on_readable. */
|
||||
void core_request_drop(vmsig_core* c, int ctl_id);
|
||||
|
||||
/* emit hooks handed to adapters (UP) and controls (DOWN). Defined in loop.c. */
|
||||
int core_emit_up (void* token, vmsig_event* ev);
|
||||
int core_emit_down(void* token, vmsig_event* ev);
|
||||
|
||||
/* ===== Address-space context (MEMCTX seam; retained context) ===== */
|
||||
/* Context registration hooks (handed to the adapter in vmsig_emit; defined in core.c). */
|
||||
int core_register_memctx (void* token, const vmsig_memctx_reg* reg);
|
||||
void core_unregister_memctx(void* token, uint32_t endpoint);
|
||||
|
||||
/* Multicast MEMCTX to qualified subscribers + mark the retain cell valid
|
||||
* (from pump_up on the VMSIG_EV_MEMCTX trigger; defined in loop.c). */
|
||||
void core_memctx_route(vmsig_core* c, const vmsig_event* trigger);
|
||||
|
||||
/* Replay retained MEMCTX to a single (late) subscriber (from vmsig_core_add_control;
|
||||
* defined in loop.c). */
|
||||
void core_memctx_replay(vmsig_core* c, int ctl_id);
|
||||
|
||||
/* Bump the endpoint's epoch on a destructive lifecycle transition: epoch++, invalidate
|
||||
* the retain cell, emit MEMCTX_INVALIDATED, request re-bootstrap from the adapter.
|
||||
* Observed by the core in pump_up on UP VM_LIFECYCLE (defined in loop.c). */
|
||||
void core_epoch_bump(vmsig_core* c, uint32_t endpoint);
|
||||
|
||||
/* ===== Lease layer (defined in loop.c) ===== */
|
||||
/* Intercept CMD_ACQUIRE/RELEASE/LEASE_STATUS (synchronously from core_emit_down, not in ctx). */
|
||||
void core_lease_acquire(vmsig_core* c, int ctl_id, const vmsig_event* ev);
|
||||
void core_lease_release(vmsig_core* c, int ctl_id, const vmsig_event* ev);
|
||||
void core_lease_status (vmsig_core* c, int ctl_id, const vmsig_event* ev);
|
||||
|
||||
/* Reclaim the lease of a dead control (from core_reap, BEFORE e->active=0). */
|
||||
void core_lease_reap_control(vmsig_core* c, int ctl_id);
|
||||
|
||||
/* Wake the loop (eventfd nudge). Defined in loop.c. */
|
||||
void core_wake(vmsig_core* c);
|
||||
|
||||
#endif /* VMSIG_CORE_INTERNAL_H */
|
||||
@@ -0,0 +1,620 @@
|
||||
/* loop.c — non-blocking epoll loop, dispatch, pump up/down, emit hooks,
|
||||
* graceful shutdown. No sleep/polling/busy-wait: every wakeup is an fd. */
|
||||
#include "core_internal.h"
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <sys/epoll.h>
|
||||
|
||||
/* Consume 8-byte values from `fd` until a read no longer returns a full
 * uint64_t (short read, EOF or error). */
static void drain_counter_fd(int fd) {
    uint64_t scratch;

    for (;;) {
        ssize_t got = read(fd, &scratch, sizeof scratch);
        if (got != (ssize_t)sizeof scratch)
            break;
    }
}
|
||||
|
||||
/* Nudge the loop by adding 1 to the wake eventfd. The write result is
 * deliberately ignored. */
void core_wake(vmsig_core* c) {
    const uint64_t tick = 1;

    if (write(c->wake_fd, &tick, sizeof tick) < 0) {
        /* intentionally ignored */
    }
}
|
||||
|
||||
/* UP emit hook handed to adapters: submits `ev` to the context in the UP
 * direction, then always wakes the loop (the emission may come from off the loop
 * thread). `token` is the core. Returns vmsig_ctx_submit's result. */
int core_emit_up(void* token, vmsig_event* ev) {
    vmsig_core* core = token;
    int rc;

    rc = vmsig_ctx_submit(core->ctx, VMSIG_DIR_UP, ev);
    core_wake(core);
    return rc;
}
|
||||
|
||||
/* Build an origin value: the low 16 bits hold the control's id+1 (so 0 is never a
 * valid origin for id >= 0), the high 16 bits hold the slot generation. This lets
 * a reply be addressed to the initiator and lets stale slot reuse be filtered out. */
static uint32_t origin_pack(int id, uint16_t gen) {
    uint32_t low  = (uint32_t)(id + 1) & 0xFFFFu;
    uint32_t high = (uint32_t)gen << 16;

    return high | low;
}
|
||||
/* Live control by origin with generation check; NULL if gone/slot reused. */
|
||||
static core_control_ent* origin_ctl(vmsig_core* c, uint32_t origin) {
|
||||
if (!origin) return NULL;
|
||||
int id = (int)(origin & 0xFFFFu) - 1;
|
||||
uint16_t gen = (uint16_t)(origin >> 16);
|
||||
if (id < 0 || id >= c->ncontrols) return NULL;
|
||||
core_control_ent* e = &c->controls[id];
|
||||
if (!e->active || e->gen != gen) return NULL;
|
||||
return e;
|
||||
}
|
||||
|
||||
/* Capability for a DOWN command (unknown => deny). Destructive CMD_LIFECYCLE
|
||||
* (powerdown/reset, code in inln[0]) requires CAP_POWER, safe ones CAP_LIFECYCLE. */
|
||||
static uint32_t cap_for_down(const vmsig_event* ev) {
|
||||
switch (ev->kind) {
|
||||
case VMSIG_EV_CMD_INPUT:
|
||||
case VMSIG_EV_CMD_QUERY_INPUT: return VMSIG_CAP_INPUT; /* injection / held-key query */
|
||||
case VMSIG_EV_CMD_LIFECYCLE:
|
||||
return (ev->inln[0] == VMSIG_LIFE_POWERDOWN || ev->inln[0] == VMSIG_LIFE_RESET)
|
||||
? VMSIG_CAP_POWER : VMSIG_CAP_LIFECYCLE;
|
||||
case VMSIG_EV_CMD_VM: /* op in inln[0] (vmsig_vm_cmd, op<256) */
|
||||
return (ev->inln[0] == VMSIG_VMOP_RESET || ev->inln[0] == VMSIG_VMOP_POWERDOWN ||
|
||||
ev->inln[0] == VMSIG_VMOP_QUIT) ? VMSIG_CAP_POWER : VMSIG_CAP_VM;
|
||||
case VMSIG_EV_CMD_MEMWRITE: return VMSIG_CAP_MEMWRITE; /* atomic guest-memory write */
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
/* ===== Lease layer: classification and helpers ===== */
|
||||
|
||||
/* Lease class for a DOWN command. MIRRORS cap_for_down by destructiveness:
|
||||
* - CMD_INPUT -> INPUT;
|
||||
* - CMD_LIFECYCLE powerdown/reset -> POWER;
|
||||
* - CMD_VM reset/powerdown/quit -> POWER;
|
||||
* - everything else (safe/read-only/stream/query) -> -1 (not lease-gated).
|
||||
* CMD_LIFECYCLE and CMD_VM route to DIFFERENT adapters (INPUT/VMHOST) but share ONE
|
||||
* POWER class per endpoint: a single owner of VM destruction (intentional). */
|
||||
static int lease_class_for_down(const vmsig_event* ev) {
|
||||
switch (ev->kind) {
|
||||
case VMSIG_EV_CMD_INPUT:
|
||||
return VMSIG_LEASE_INPUT;
|
||||
case VMSIG_EV_CMD_LIFECYCLE:
|
||||
return (ev->inln[0] == VMSIG_LIFE_POWERDOWN || ev->inln[0] == VMSIG_LIFE_RESET)
|
||||
? VMSIG_LEASE_POWER : -1;
|
||||
case VMSIG_EV_CMD_VM:
|
||||
return (ev->inln[0] == VMSIG_VMOP_RESET || ev->inln[0] == VMSIG_VMOP_POWERDOWN ||
|
||||
ev->inln[0] == VMSIG_VMOP_QUIT) ? VMSIG_LEASE_POWER : -1;
|
||||
case VMSIG_EV_CMD_MEMWRITE:
|
||||
return VMSIG_LEASE_MEMWRITE; /* always destructive (write to shared guest memory) */
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
/* Cap required to lease a class (probing/holding a class without the cap is forbidden). */
|
||||
static uint32_t cap_for_lease_class(int cls) {
|
||||
return cls == VMSIG_LEASE_INPUT ? VMSIG_CAP_INPUT
|
||||
: cls == VMSIG_LEASE_POWER ? VMSIG_CAP_POWER
|
||||
: cls == VMSIG_LEASE_MEMWRITE ? VMSIG_CAP_MEMWRITE : 0u;
|
||||
}
|
||||
/* Source bitmask permitted to hold a lease class: mirrors the grant's source ceiling
|
||||
* (which grant_allows_down enforces on the command itself). Leasing is intercepted
|
||||
* BEFORE grant_allows_down, so source is checked HERE — otherwise a principal without
|
||||
* the required seam could hold someone else's cell (DoS), bypassing source_mask.
|
||||
* INPUT -> SRC_INPUT; POWER -> SRC_INPUT (lifecycle) OR SRC_VMHOST (vm) — one
|
||||
* destructive path suffices; MEMWRITE -> SRC_MEMCTX (lives on the MEMCTX seam). */
|
||||
static uint32_t source_mask_for_lease_class(int cls) {
|
||||
return cls == VMSIG_LEASE_INPUT ? (1u << VMSIG_SRC_INPUT)
|
||||
: cls == VMSIG_LEASE_POWER ? ((1u << VMSIG_SRC_INPUT) | (1u << VMSIG_SRC_VMHOST))
|
||||
: cls == VMSIG_LEASE_MEMWRITE ? (1u << VMSIG_SRC_MEMCTX) : 0u;
|
||||
}
|
||||
|
||||
/* Capability to receive an UP event: address-space context (MEMCTX/MEMCTX_INVALIDATED)
|
||||
* -> CAP_MEMCTX; cursor is screen data, available to a GUI observer (OBSERVE) OR an
|
||||
* input actor (INPUT); otherwise CAP_OBSERVE (frames/SEAM/generic). The grant_allows_up
|
||||
* gate checks intersection, so OBSERVE|INPUT means "either of the two". */
|
||||
static uint32_t cap_for_up(const vmsig_event* ev) {
|
||||
if (ev->kind == VMSIG_EV_CURSOR_STATE) return VMSIG_CAP_OBSERVE | VMSIG_CAP_INPUT;
|
||||
return (ev->source == VMSIG_SRC_MEMCTX) ? VMSIG_CAP_MEMCTX : VMSIG_CAP_OBSERVE;
|
||||
}
|
||||
static int grant_allows_down(const vmsig_grant* g, const vmsig_event* ev) {
|
||||
if (ev->endpoint >= 64) return 0; /* 64-bit mask: <=64 VMs/cores */
|
||||
if (!(g->endpoint_mask & (1ull << ev->endpoint))) return 0;
|
||||
if (!(g->source_mask & (1u << ev->source))) return 0; /* source ceiling on DOWN too */
|
||||
uint32_t need = cap_for_down(ev);
|
||||
return need && (g->cap_mask & need);
|
||||
}
|
||||
static int grant_allows_up(const vmsig_grant* g, const vmsig_event* ev) {
|
||||
if (ev->endpoint >= 64) return 0;
|
||||
if (!(g->cap_mask & cap_for_up(ev))) return 0;
|
||||
if (!(g->endpoint_mask & (1ull << ev->endpoint))) return 0;
|
||||
if (!(g->source_mask & (1u << ev->source))) return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Find an adapter by (endpoint, source). NULL if none. Used by pump_down to route a
|
||||
* DOWN command to its adapter. */
|
||||
static core_adapter_ent* core_find_adapter(vmsig_core* c, uint32_t endpoint,
|
||||
vmsig_source source) {
|
||||
for (int i = 0; i < c->nadapters; i++) {
|
||||
core_adapter_ent* e = &c->adapters[i];
|
||||
if (e->ops->source == source && e->endpoint == endpoint) return e;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* ===== Lease layer: grant/release/status/finalization/reclaim =====
|
||||
* Intercepted in core_emit_down BEFORE grant_allows_down (synchronous, not in ctx, does
|
||||
* not touch pending). Addressed UP replies to the initiator via core_emit_up
|
||||
* (origin+generation). */
|
||||
|
||||
/* Addressed UP reply to the initiator of a lease request. */
|
||||
static void lease_reply(vmsig_core* c, const vmsig_event* req, vmsig_kind kind,
|
||||
uint32_t cls, uint32_t reason) {
|
||||
vmsig_event up;
|
||||
memset(&up, 0, sizeof up);
|
||||
up.kind = kind; up.source = VMSIG_SRC_CORE; up.dir = VMSIG_DIR_UP;
|
||||
up.prio = VMSIG_PRIO_URGENT; up.endpoint = req->endpoint; up.origin = req->origin;
|
||||
vmsig_lease_req lr = { cls, reason };
|
||||
memcpy(up.inln, &lr, sizeof lr);
|
||||
core_emit_up(c, &up);
|
||||
}
|
||||
|
||||
/* Lease denial: audit (visibility of authorization/contention denials — capability/
|
||||
* endpoint enumeration via ACQUIRE is observable) + addressed LEASE_DENIED to initiator. */
|
||||
static void lease_deny(vmsig_core* c, const vmsig_event* req, uint32_t principal,
|
||||
uint32_t cls, uint32_t reason) {
|
||||
vmsig_audit a = { VMSIG_AUDIT_LEASE_DENIED, principal, req->endpoint, cls, reason };
|
||||
core_audit(c, &a);
|
||||
lease_reply(c, req, VMSIG_EV_LEASE_DENIED, cls, reason);
|
||||
}
|
||||
|
||||
/* Principal of the cell owner (for STATUS); 0 if owner is dead/absent. */
|
||||
static uint32_t lease_owner_principal(vmsig_core* c, uint32_t owner) {
|
||||
core_control_ent* e = origin_ctl(c, owner);
|
||||
return e ? e->grant.principal : 0u;
|
||||
}
|
||||
|
||||
/* IMPORTANT (layer isolation): signaling does NOT release held keys on lease loss and
|
||||
* does NOT track held state at all. held is the ACTUATOR's record (vmctl); release is the
|
||||
* control's decision. On owner change/reset the cell is simply freed; stuck keys remain
|
||||
* the control's concern (it can issue CMD_QUERY_INPUT and release its own while owner). */
|
||||
|
||||
void core_lease_acquire(vmsig_core* c, int ctl_id, const vmsig_event* ev) {
|
||||
core_control_ent* e = &c->controls[ctl_id];
|
||||
uint32_t cls = ((const vmsig_lease_req*)ev->inln)->cls;
|
||||
uint32_t ep = ev->endpoint;
|
||||
|
||||
/* 1. validate class/endpoint/grant (default-deny; every denial is audited). */
|
||||
if (cls >= VMSIG_LEASE_CLASS_MAX) {
|
||||
lease_deny(c, ev, e->grant.principal, cls, VMSIG_LEASE_DENY_BADCLASS);
|
||||
return;
|
||||
}
|
||||
if (ep >= 64 || !(e->grant.endpoint_mask & (1ull << ep))) {
|
||||
lease_deny(c, ev, e->grant.principal, cls, VMSIG_LEASE_DENY_NOGRANT);
|
||||
return;
|
||||
}
|
||||
if (!(e->grant.cap_mask & cap_for_lease_class((int)cls))) {
|
||||
lease_deny(c, ev, e->grant.principal, cls, VMSIG_LEASE_DENY_NOCAP);
|
||||
return;
|
||||
}
|
||||
/* source ceiling: holding a class without rights to its seam is forbidden (else a
|
||||
* DoS hold of someone else's cell bypassing source_mask, since interception is
|
||||
* BEFORE grant_allows_down). */
|
||||
if (!(e->grant.source_mask & source_mask_for_lease_class((int)cls))) {
|
||||
lease_deny(c, ev, e->grant.principal, cls, VMSIG_LEASE_DENY_NOGRANT);
|
||||
return;
|
||||
}
|
||||
|
||||
core_lease_cell* cell = &c->lease[ep][cls];
|
||||
uint32_t me = ev->origin;
|
||||
|
||||
/* 2a. free OR dead owner (origin_ctl==NULL) => take as if free. */
|
||||
core_control_ent* owner_e = cell->owner ? origin_ctl(c, cell->owner) : NULL;
|
||||
if (cell->owner == 0 || !owner_e) {
|
||||
cell->owner = me; cell->owner_prio = e->grant.arb_prio;
|
||||
vmsig_audit a = { VMSIG_AUDIT_LEASE_GRANTED, e->grant.principal, ep, cls, 0 };
|
||||
core_audit(c, &a);
|
||||
lease_reply(c, ev, VMSIG_EV_LEASE_GRANTED, cls, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
/* 2b. owner is the caller itself => idempotent GRANTED. */
|
||||
if (cell->owner == me) {
|
||||
lease_reply(c, ev, VMSIG_EV_LEASE_GRANTED, cls, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
/* 2c. held by a LIVE other owner => policy. incumbent is the live grant. */
|
||||
vmsig_arb_decision dec;
|
||||
if (c->arb_cb) {
|
||||
dec = c->arb_cb(c->arb_ud, ep, cls, &owner_e->grant, &e->grant);
|
||||
} else {
|
||||
dec = (e->grant.arb_prio > cell->owner_prio) ? VMSIG_ARB_PREEMPT : VMSIG_ARB_DENY;
|
||||
}
|
||||
if (dec != VMSIG_ARB_PREEMPT) {
|
||||
/* equal priority => owner keeps it (HELD); strictly lower => LOWER_PRIO. */
|
||||
uint32_t reason = (e->grant.arb_prio < cell->owner_prio)
|
||||
? VMSIG_LEASE_DENY_LOWER_PRIO : VMSIG_LEASE_DENY_HELD;
|
||||
lease_deny(c, ev, e->grant.principal, cls, reason);
|
||||
return;
|
||||
}
|
||||
|
||||
/* PREEMPT: notify the old owner (REVOKED), switch owner, grant to the new one.
|
||||
* signaling does NOT release held keys (that is the control's decision): the
|
||||
* ex-owner is responsible for its stuck keys; the new owner can query held
|
||||
* (CMD_QUERY_INPUT) and release them. */
|
||||
uint32_t old_owner = cell->owner;
|
||||
{
|
||||
vmsig_event rv;
|
||||
memset(&rv, 0, sizeof rv);
|
||||
rv.endpoint = ep; rv.origin = old_owner;
|
||||
lease_reply(c, &rv, VMSIG_EV_LEASE_REVOKED, cls, 0);
|
||||
}
|
||||
{
|
||||
vmsig_audit a = { VMSIG_AUDIT_LEASE_REVOKED, owner_e->grant.principal, ep, cls, 0 };
|
||||
core_audit(c, &a);
|
||||
}
|
||||
cell->owner = me; cell->owner_prio = e->grant.arb_prio;
|
||||
{
|
||||
vmsig_audit a = { VMSIG_AUDIT_LEASE_GRANTED, e->grant.principal, ep, cls, 0 };
|
||||
core_audit(c, &a);
|
||||
}
|
||||
lease_reply(c, ev, VMSIG_EV_LEASE_GRANTED, cls, 0);
|
||||
}
|
||||
|
||||
/* Handle CMD_RELEASE from control `ctl_id` for the (endpoint, class) in `ev`.
 *
 * The same gates as acquire apply BEFORE any action (class range, endpoint range
 * and mask, class capability, class source ceiling); a request failing any of
 * them, or coming from a non-owner, is a silent no-op. Otherwise the cell is
 * freed and an addressed LEASE_RELEASED reply is sent.
 *
 * Signaling does NOT release held keys — the control releases its own keys
 * before releasing the lease if it needs to. */
void core_lease_release(vmsig_core* c, int ctl_id, const vmsig_event* ev) {
    core_control_ent* e = &c->controls[ctl_id];

    /* copy the request out of inln instead of dereferencing a cast pointer: no
     * alignment or aliasing assumptions, and symmetric with lease_reply's memcpy */
    vmsig_lease_req rq;
    memcpy(&rq, ev->inln, sizeof rq);
    uint32_t cls = rq.cls;
    uint32_t ep = ev->endpoint;

    /* cross-endpoint isolation + cap/source gate BEFORE any action (like acquire). */
    if (cls >= VMSIG_LEASE_CLASS_MAX || ep >= 64) return;
    if (!(e->grant.endpoint_mask & (1ull << ep))) return;
    if (!(e->grant.cap_mask & cap_for_lease_class((int)cls))) return;
    if (!(e->grant.source_mask & source_mask_for_lease_class((int)cls))) return;

    core_lease_cell* cell = &c->lease[ep][cls];
    if (cell->owner != ev->origin) return;   /* not owner => no-op */

    cell->owner = 0;
    cell->owner_prio = 0;
    lease_reply(c, ev, VMSIG_EV_LEASE_RELEASED, cls, 0);
}
|
||||
|
||||
/* Answer a lease-status probe for one (endpoint, class) cell.
 *
 * c      — core owning the lease table.
 * ctl_id — index of the probing control in c->controls (not range-checked here).
 * ev     — the LEASE_STATUS command: ev->inln carries a vmsig_lease_req,
 *          ev->endpoint selects the endpoint, ev->origin addresses the reply.
 *
 * The probe is dropped without a reply when the class or endpoint is out of range
 * or the control's grant does not cover the endpoint / class capability / class
 * source. Otherwise an URGENT, UP-direction LEASE_STATUS event addressed to
 * ev->origin is emitted carrying {class, busy, owner principal or 0}. */
void core_lease_status(vmsig_core* c, int ctl_id, const vmsig_event* ev) {
    core_control_ent* e = &c->controls[ctl_id];
    uint32_t ep = ev->endpoint;

    /* Copy the request out of the inline buffer rather than dereferencing a cast
     * pointer: the buffer is written with memcpy elsewhere in this file, and a
     * cast read is an aliasing/alignment hazard if inln is a plain byte array. */
    vmsig_lease_req req;
    memcpy(&req, ev->inln, sizeof req);
    uint32_t cls = req.cls;

    /* busy-state can be probed only within one's own endpoint and with the class cap
     * (else a principal without CAP_INPUT/CAP_POWER would leak busy-state/other principal). */
    if (cls >= VMSIG_LEASE_CLASS_MAX || ep >= 64) return;
    if (!(e->grant.endpoint_mask & (1ull << ep))) return;
    if (!(e->grant.cap_mask & cap_for_lease_class((int)cls))) return;
    if (!(e->grant.source_mask & source_mask_for_lease_class((int)cls))) return;

    core_lease_cell* cell = &c->lease[ep][cls];
    /* A cell counts as busy only while its recorded owner still resolves to a control. */
    uint32_t busy = (cell->owner && origin_ctl(c, cell->owner)) ? 1u : 0u;

    vmsig_event up;
    memset(&up, 0, sizeof up);
    up.kind = VMSIG_EV_LEASE_STATUS;
    up.source = VMSIG_SRC_CORE;
    up.dir = VMSIG_DIR_UP;
    up.prio = VMSIG_PRIO_URGENT;
    up.endpoint = ep;
    up.origin = ev->origin;

    vmsig_lease_status st = { cls, busy, busy ? lease_owner_principal(c, cell->owner) : 0u };
    memcpy(up.inln, &st, sizeof st);
    core_emit_up(c, &up);
}
|
||||
|
||||
/* Free every lease cell still owned by control slot ctl_id and audit each one as
 * LEASE_RECLAIMED. Must run while the slot's generation is still the one packed
 * into the cells' owner field, i.e. BEFORE the slot is marked inactive. */
void core_lease_reap_control(vmsig_core* c, int ctl_id) {
    core_control_ent* dying = &c->controls[ctl_id];
    /* Ownership is matched on the packed (slot, current generation) origin. */
    const uint32_t dying_origin = origin_pack(ctl_id, dying->gen);

    for (uint32_t ep = 0; ep < 64; ep++) {
        for (int cls = 0; cls < VMSIG_LEASE_CLASSES; cls++) {
            core_lease_cell* cell = &c->lease[ep][cls];
            if (cell->owner == dying_origin) {
                /* Only the cell is freed; keys held by the dead owner are left
                 * alone (the next owner can inspect them via CMD_QUERY_INPUT). */
                cell->owner = 0;
                cell->owner_prio = 0;

                vmsig_audit a = { VMSIG_AUDIT_LEASE_RECLAIMED,
                                  dying->grant.principal, ep, (uint32_t)cls, 0 };
                core_audit(c, &a);
            }
        }
    }
}
|
||||
|
||||
/* DOWN emit from a control: enforcement against THIS control's grant. */
|
||||
int core_emit_down(void* token, vmsig_event* ev) {
|
||||
core_down_ctx* d = token;
|
||||
vmsig_core* c = d->core;
|
||||
core_control_ent* e = &c->controls[d->ctl_id];
|
||||
if (!e->active) { vmsig_payload_release(ev); return -1; }
|
||||
|
||||
/* Lease arbitration is intercepted HERE (synchronous, not in ctx, does not touch
|
||||
* pending). origin is needed for the addressed reply and as the owner key. */
|
||||
if (ev->kind == VMSIG_EV_CMD_ACQUIRE || ev->kind == VMSIG_EV_CMD_RELEASE ||
|
||||
ev->kind == VMSIG_EV_CMD_LEASE_STATUS) {
|
||||
ev->origin = origin_pack(d->ctl_id, e->gen);
|
||||
if (ev->kind == VMSIG_EV_CMD_ACQUIRE) core_lease_acquire(c, d->ctl_id, ev);
|
||||
else if (ev->kind == VMSIG_EV_CMD_RELEASE) core_lease_release(c, d->ctl_id, ev);
|
||||
else core_lease_status(c, d->ctl_id, ev);
|
||||
vmsig_payload_release(ev);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!grant_allows_down(&e->grant, ev)) {
|
||||
vmsig_audit a = { VMSIG_AUDIT_DOWN_DENIED, e->grant.principal,
|
||||
ev->endpoint, (uint32_t)ev->kind, 0 };
|
||||
core_audit(c, &a); /* rejected by policy (endpoint/source/class) */
|
||||
vmsig_payload_release(ev);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Lease GATE: destruction is passed ONLY by the class's current owner.
|
||||
* A non-owner (or an owner whose slot is dead) => drop + audit LEASE_DENIED
|
||||
* (distinguishable from grant-deny). A free cell => also drop: destruction cannot be
|
||||
* used without an explicit lease. Safe/read-only commands (cls<0) are not gated. */
|
||||
{
|
||||
int cls = lease_class_for_down(ev);
|
||||
if (cls >= 0 && ev->endpoint < 64) {
|
||||
uint32_t me = origin_pack(d->ctl_id, e->gen);
|
||||
uint32_t owner = c->lease[ev->endpoint][cls].owner;
|
||||
if (owner != me || !origin_ctl(c, owner)) {
|
||||
vmsig_audit a = { VMSIG_AUDIT_LEASE_DENIED, e->grant.principal,
|
||||
ev->endpoint, (uint32_t)ev->kind, 0 };
|
||||
core_audit(c, &a);
|
||||
vmsig_payload_release(ev);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (e->pending >= VMSIG_DOWN_PENDING_MAX) { /* fairness/DoS: DOWN cap per poller */
|
||||
vmsig_audit a = { VMSIG_AUDIT_DOWN_DENIED, e->grant.principal,
|
||||
ev->endpoint, (uint32_t)ev->kind, 0 };
|
||||
core_audit(c, &a);
|
||||
vmsig_payload_release(ev);
|
||||
return -1;
|
||||
}
|
||||
ev->origin = origin_pack(d->ctl_id, e->gen); /* addressed reply + pending accounting */
|
||||
e->pending++;
|
||||
int r = vmsig_ctx_submit(c->ctx, VMSIG_DIR_DOWN, ev);
|
||||
if (r != 0) e->pending--; /* not enqueued (drop/err) */
|
||||
core_wake(c);
|
||||
return r;
|
||||
}
|
||||
|
||||
static int sub_match(const vmsig_sub* sub, const vmsig_event* ev) {
|
||||
if (sub->source_mask && !(sub->source_mask & (1u << ev->source))) return 0;
|
||||
if (ev->prio < sub->prio_min) return 0;
|
||||
if (sub->endpoint_mask) {
|
||||
if (ev->endpoint >= 64 || !(sub->endpoint_mask & (1ull << ev->endpoint))) return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* ===== Address-space context (MEMCTX seam): multicast / retain-replay / epoch =====
|
||||
* The core vends ONE coherent datum per-endpoint: kcr3+locator paired with an RO-fd. A
|
||||
* MEMCTX trigger from the adapter => the core builds the AUTHORITATIVE locator from the
|
||||
* adapter snapshot (reg.describe) + stamps the epoch (single source of truth) and
|
||||
* distributes to qualified subscribers with re-sharing of the RO-fd. The same path serves
|
||||
* replay to a late subscriber. */
|
||||
|
||||
/* Build a MEMCTX delivery event for endpoint ep. segs are borrowed from the adapter's
|
||||
* buffer (delivery is synchronous on the loop thread; ownership is not transferred).
|
||||
* 1 — built. */
|
||||
static int core_memctx_build(vmsig_core* c, uint32_t ep, vmsig_event* ev) {
|
||||
if (ep >= 64) return 0;
|
||||
core_memctx_cell* cell = &c->memctx[ep];
|
||||
if (!cell->registered || !cell->reg.describe) return 0;
|
||||
|
||||
vmsig_memctx pod;
|
||||
memset(&pod, 0, sizeof pod);
|
||||
const vmsig_memseg* segs = NULL;
|
||||
uint32_t nseg = 0;
|
||||
cell->reg.describe(cell->reg.ctx, &pod, &segs, &nseg);
|
||||
pod.epoch = c->epoch[ep]; /* core stamps the epoch */
|
||||
pod.nseg = nseg;
|
||||
pod.flags |= VMSIG_MEMCTX_RDONLY; /* outward — always read-only */
|
||||
|
||||
memset(ev, 0, sizeof *ev);
|
||||
ev->kind = VMSIG_EV_MEMCTX; ev->source = VMSIG_SRC_MEMCTX; ev->dir = VMSIG_DIR_UP;
|
||||
ev->prio = VMSIG_PRIO_NORMAL; ev->endpoint = ep;
|
||||
memcpy(ev->inln, &pod, sizeof pod);
|
||||
ev->payload.data = (void*)segs; /* borrowed: owner is the adapter */
|
||||
ev->payload.len = (size_t)nseg * sizeof(vmsig_memseg);
|
||||
ev->payload.codec = VMSIG_CODEC_MEMCTX;
|
||||
ev->payload.flags = VMSIG_PL_BORROWED;
|
||||
ev->payload.release = NULL;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Deliver MEMCTX to one qualified control: fresh RO-fd from reg.share_fd
|
||||
* (socket -> cmsg, in-proc -> direct int), attach_memctx, close fd (the core does not own
|
||||
* the fd). On success — audit MEMCTX_GRANTED. */
|
||||
static void core_memctx_deliver_one(vmsig_core* c, core_memctx_cell* cell,
|
||||
core_control_ent* e, const vmsig_event* ev) {
|
||||
if (!e->ops->attach_memctx) return; /* control does not accept MEMCTX */
|
||||
int fd = cell->reg.share_fd ? cell->reg.share_fd(cell->reg.ctx) : -1;
|
||||
int r = e->ops->attach_memctx(e->ctl, ev, fd);
|
||||
if (fd >= 0) close(fd); /* the core does not own the ro-fd */
|
||||
if (r == 0) {
|
||||
vmsig_audit a = { VMSIG_AUDIT_MEMCTX_GRANTED, e->grant.principal,
|
||||
ev->endpoint, 0, 0 };
|
||||
core_audit(c, &a);
|
||||
}
|
||||
}
|
||||
|
||||
/* React to a MEMCTX trigger: build the context event for the trigger's endpoint,
 * mark the endpoint's context as published for the current epoch, and deliver it
 * to every active control whose grant and subscription both accept the event.
 * Nothing happens when the endpoint is out of range, unregistered, or the event
 * cannot be built. */
void core_memctx_route(vmsig_core* c, const vmsig_event* trigger) {
    const uint32_t ep = trigger->endpoint;
    if (ep >= 64) return;

    core_memctx_cell* cell = &c->memctx[ep];
    if (!cell->registered) return;

    vmsig_event ev;
    if (!core_memctx_build(c, ep, &ev)) return;

    /* The context for this epoch is now published (eligible for later replay). */
    cell->valid = 1;
    cell->epoch = c->epoch[ep];

    for (int i = 0; i < c->ncontrols; i++) {
        core_control_ent* e = &c->controls[i];
        if (!e->active) continue;
        if (!grant_allows_up(&e->grant, &ev)) continue;
        if (!sub_match(&e->sub, &ev)) continue;
        core_memctx_deliver_one(c, cell, e, &ev);
    }
}
|
||||
|
||||
/* Replay the currently published MEMCTX of every endpoint to a single control.
 * Only endpoints that are registered and whose context is still valid are
 * considered; the event is rebuilt per endpoint and delivered when the control's
 * grant and subscription both accept it. An out-of-range or inactive ctl_id is
 * ignored. */
void core_memctx_replay(vmsig_core* c, int ctl_id) {
    if (ctl_id < 0 || ctl_id >= c->ncontrols) return;

    core_control_ent* e = &c->controls[ctl_id];
    if (!e->active) return;

    for (uint32_t ep = 0; ep < 64; ep++) {
        core_memctx_cell* cell = &c->memctx[ep];
        if (cell->registered && cell->valid) {
            vmsig_event ev;
            if (core_memctx_build(c, ep, &ev) &&
                grant_allows_up(&e->grant, &ev) &&
                sub_match(&e->sub, &ev)) {
                core_memctx_deliver_one(c, cell, e, &ev);
            }
        }
    }
}
|
||||
|
||||
/* Advance the epoch of one endpoint.
 *
 * The endpoint's epoch counter is incremented, its published context is marked
 * invalid (so it is no longer replayed), an URGENT MEMCTX_INVALIDATED event carrying
 * {endpoint, new epoch} is emitted upward, and — when the endpoint is registered and
 * provides an invalidate() callback — the adapter is told the new epoch. An
 * out-of-range endpoint is ignored. */
void core_epoch_bump(vmsig_core* c, uint32_t endpoint) {
    if (endpoint >= 64) return;

    core_memctx_cell* cell = &c->memctx[endpoint];
    c->epoch[endpoint]++;
    cell->valid = 0;                       /* prior-epoch context is not replayed */

    vmsig_memctx_inv inv = { endpoint, c->epoch[endpoint] };

    vmsig_event up;
    memset(&up, 0, sizeof up);
    up.kind = VMSIG_EV_MEMCTX_INVALIDATED;
    up.source = VMSIG_SRC_MEMCTX;
    up.dir = VMSIG_DIR_UP;
    up.prio = VMSIG_PRIO_URGENT;
    up.endpoint = endpoint;
    memcpy(up.inln, &inv, sizeof inv);
    core_emit_up(c, &up);                  /* broadcast to holders (CAP_MEMCTX gate) */

    /* Ask the adapter to re-bootstrap; it is handed the new epoch value. */
    if (!cell->registered) return;
    if (cell->reg.invalidate)
        cell->reg.invalidate(cell->reg.ctx, c->epoch[endpoint]);
}
|
||||
|
||||
/* UP: drain the context queue and dispatch to subscribed controls */
|
||||
static void pump_up(vmsig_core* c) {
|
||||
vmsig_event ev;
|
||||
while (vmsig_ctx_next(c->ctx, VMSIG_DIR_UP, &ev) == 1) {
|
||||
if (ev.kind == VMSIG_EV_MEMCTX) {
|
||||
/* Context trigger: the core builds the authoritative locator (adapter snapshot
|
||||
* + epoch stamp) and distributes to qualified holders with re-sharing of the
|
||||
* RO-fd. The trigger itself is NOT delivered as an ordinary event. */
|
||||
core_memctx_route(c, &ev);
|
||||
vmsig_payload_release(&ev); /* inline trigger (release=NULL) — harmless */
|
||||
continue;
|
||||
}
|
||||
if (ev.kind == VMSIG_EV_VM_LIFECYCLE && ev.origin == 0) {
|
||||
/* Epoch-transition observation: a destructive async transition (VMHOST
|
||||
* broadcast) invalidates the address-space context. NOT continue — VM_LIFECYCLE
|
||||
* still goes to subscribers below via the normal broadcast. */
|
||||
const vmsig_vm_state* vs = (const vmsig_vm_state*)ev.inln;
|
||||
if (vs->state == VMSIG_VM_RESET || vs->state == VMSIG_VM_POWERDOWN ||
|
||||
vs->state == VMSIG_VM_SHUTDOWN)
|
||||
core_epoch_bump(c, ev.endpoint);
|
||||
}
|
||||
if (ev.origin) {
|
||||
/* addressed reply ONLY to the initiator (origin+generation). The command was
|
||||
* already authorized by the grant => we deliver the reply without re-check; if
|
||||
* the initiator is gone/slot reused — we drop (private data, not broadcast). */
|
||||
core_control_ent* e = origin_ctl(c, ev.origin);
|
||||
if (e && e->ops->deliver) e->ops->deliver(e->ctl, &ev);
|
||||
} else {
|
||||
/* unaddressed event — broadcast; effective = grant ∩ sub */
|
||||
for (int i = 0; i < c->ncontrols; i++) {
|
||||
core_control_ent* e = &c->controls[i];
|
||||
if (!e->active) continue;
|
||||
if (grant_allows_up(&e->grant, &ev) && sub_match(&e->sub, &ev) && e->ops->deliver)
|
||||
e->ops->deliver(e->ctl, &ev);
|
||||
}
|
||||
}
|
||||
vmsig_payload_release(&ev);
|
||||
}
|
||||
}
|
||||
|
||||
/* DOWN: drain the queue and route the command to the adapter (source+endpoint) */
|
||||
static void pump_down(vmsig_core* c) {
|
||||
vmsig_event ev;
|
||||
while (vmsig_ctx_next(c->ctx, VMSIG_DIR_DOWN, &ev) == 1) {
|
||||
core_control_ent* oe = origin_ctl(c, ev.origin); /* command has left ctx */
|
||||
if (oe && oe->pending) oe->pending--; /* THE ONLY decrement */
|
||||
|
||||
/* In-flight fencing: destruction whose origin is NO LONGER the class owner (lease
|
||||
* lost between the emit_down gate and dequeue) is dropped BEFORE actuation. Does
|
||||
* NOT finalize (finalization is done by acquire/reap) — else a double key-up.
|
||||
* pending is NOT touched here (already decremented above). */
|
||||
int cls = lease_class_for_down(&ev);
|
||||
if (cls >= 0 && ev.endpoint < 64 && c->lease[ev.endpoint][cls].owner != ev.origin) {
|
||||
/* dropping destruction that lost the lease is observable (origin owner's principal). */
|
||||
vmsig_audit a = { VMSIG_AUDIT_LEASE_DENIED, lease_owner_principal(c, ev.origin),
|
||||
ev.endpoint, (uint32_t)ev.kind, (uint32_t)cls };
|
||||
core_audit(c, &a);
|
||||
vmsig_payload_release(&ev);
|
||||
continue;
|
||||
}
|
||||
|
||||
core_adapter_ent* e = core_find_adapter(c, ev.endpoint, ev.source);
|
||||
if (e && e->ops->submit) e->ops->submit(e->a, &ev);
|
||||
vmsig_payload_release(&ev);
|
||||
}
|
||||
}
|
||||
|
||||
/* Deferred reap of detached controls: after the batch (safe — not inside their own
|
||||
* on_readable). epoll DEL + mark slot dead + ops->close. */
|
||||
static void core_reap(vmsig_core* c) {
|
||||
for (int i = 0; i < c->ncontrols; i++) {
|
||||
core_control_ent* e = &c->controls[i];
|
||||
if (!e->reap || !e->active) continue;
|
||||
if (e->slot) {
|
||||
epoll_ctl(c->epfd, EPOLL_CTL_DEL, e->slot->fd, NULL);
|
||||
e->slot->role = SLOT_DEAD;
|
||||
}
|
||||
core_lease_reap_control(c, i); /* return leases + finalize held BEFORE active=0 */
|
||||
if (e->ops->close) e->ops->close(e->ctl);
|
||||
e->active = 0;
|
||||
e->reap = 0;
|
||||
}
|
||||
}
|
||||
|
||||
int vmsig_core_run(vmsig_core* c) {
|
||||
if (!c) return -1;
|
||||
struct epoll_event evs[VMSIG_MAX_EVENTS];
|
||||
while (!__atomic_load_n(&c->stopping, __ATOMIC_ACQUIRE)) {
|
||||
int n = epoll_wait(c->epfd, evs, VMSIG_MAX_EVENTS, -1);
|
||||
if (n < 0) { if (errno == EINTR) continue; return -1; }
|
||||
for (int i = 0; i < n; i++) {
|
||||
core_slot* s = (core_slot*)evs[i].data.ptr;
|
||||
switch (s->role) {
|
||||
case SLOT_WAKEUP:
|
||||
drain_counter_fd(s->fd); /* stopping is checked in while */
|
||||
break;
|
||||
case SLOT_ADAPTER:
|
||||
if (s->ops->on_readiness)
|
||||
s->ops->on_readiness(s->adapter, s->cookie, evs[i].events);
|
||||
break;
|
||||
case SLOT_CTX_TIMING:
|
||||
drain_counter_fd(s->fd);
|
||||
break;
|
||||
case SLOT_CONTROL:
|
||||
if (s->cops->on_readable)
|
||||
s->cops->on_readable(s->ctl);
|
||||
break;
|
||||
case SLOT_SOURCE:
|
||||
if (s->on_source)
|
||||
s->on_source(s->source_user, evs[i].events);
|
||||
break;
|
||||
case SLOT_DEAD:
|
||||
break; /* detached — ignore */
|
||||
}
|
||||
}
|
||||
pump_up(c);
|
||||
pump_down(c);
|
||||
core_reap(c);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Request the run loop to stop: publish the stop flag with release ordering (it may
 * be set from another thread) and wake the loop so it re-checks the flag.
 * A NULL core is ignored. */
void vmsig_core_stop(vmsig_core* c) {
    if (c) {
        __atomic_store_n(&c->stopping, 1, __ATOMIC_RELEASE);   /* cross-thread stop signal */
        core_wake(c);
    }
}
|
||||
Reference in New Issue
Block a user