From 709f4b586aea74f6db109da7914ce20da463b4ab Mon Sep 17 00:00:00 2001 From: Gregory Lirent Date: Sat, 20 Jun 2026 18:46:31 +0300 Subject: [PATCH] vmsig: a neutral signaling layer between sensors/input and controls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An epoll-driven, neutral transfer-event bus that connects sensors and input actuators to one or more controls, bidirectionally. It owns the transfer context and events — delivery order, priority, protocol-level timing, and an interrupt-driven event model over fd sources (eventfd/timerfd/sockets) — and stays agnostic to both the sensor/input drivers and the control. What lives here: - memctx: a coherent address-space context per endpoint — the guest address-space root paired with a pre-opened read-only RAM-region fd, with per-endpoint epoch invalidation and retained replay to late subscribers. Perception lives in out-of-tree sensor libraries that consume this datum read-only. - exclusive-ownership leases for destructive resource classes (input, power, memory-write). - write-signaled memory writes (MEMWRITE): an atomic write to guest memory routed through the seam under an exclusive lease, never a writable mapping. - a host-management seam for VM lifecycle/status and a neutral input-injection command path. - multi-VM endpoints; capability-gated, audited control authorization over an in-process or unix-socket transport. Builds against headers only by default (a stub mode that exercises the seam without a VM); armed builds link the real sensor/input libraries behind flags. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 5 + CMakeLists.txt | 137 ++++++ include/vmsig.h | 21 + include/vmsig_adapter.h | 81 ++++ include/vmsig_control.h | 126 ++++++ include/vmsig_core.h | 70 ++++ include/vmsig_ctx.h | 48 +++ include/vmsig_event.h | 280 +++++++++++++ include/vmsig_memctx.h | 101 +++++ include/vmsig_socket.h | 42 ++ src/adapter/include/adapter_util.h | 44 ++ src/adapter/input/include/input.h | 18 + src/adapter/input/input.c | 230 +++++++++++ src/adapter/linux/worker.c | 162 ++++++++ src/adapter/memctx/include/memctx.h | 20 + src/adapter/memctx/memctx.c | 407 ++++++++++++++++++ src/adapter/vmhost/include/vmhost.h | 13 + src/adapter/vmhost/vmhost.c | 313 ++++++++++++++ src/cli.c | 182 ++++++++ src/control/inproc.c | 57 +++ src/control/socket.c | 318 ++++++++++++++ src/core/core.c | 224 ++++++++++ src/core/include/core_internal.h | 170 ++++++++ src/core/linux/loop.c | 620 ++++++++++++++++++++++++++++ src/ctx/ctx.c | 203 +++++++++ src/ctx/include/ctx_internal.h | 41 ++ src/test/test_authz.c | 95 +++++ src/test/test_ctx.c | 125 ++++++ src/test/test_inputobs.c | 93 +++++ src/test/test_lease.c | 525 +++++++++++++++++++++++ src/test/test_memctx.c | 339 +++++++++++++++ src/test/test_memwrite.c | 227 ++++++++++ src/test/test_mvm.c | 62 +++ src/test/test_sec.c | 121 ++++++ src/test/test_sock.c | 154 +++++++ src/test/test_vmhost.c | 146 +++++++ 36 files changed, 5820 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 include/vmsig.h create mode 100644 include/vmsig_adapter.h create mode 100644 include/vmsig_control.h create mode 100644 include/vmsig_core.h create mode 100644 include/vmsig_ctx.h create mode 100644 include/vmsig_event.h create mode 100644 include/vmsig_memctx.h create mode 100644 include/vmsig_socket.h create mode 100644 src/adapter/include/adapter_util.h create mode 100644 src/adapter/input/include/input.h create mode 100644 src/adapter/input/input.c create mode 100644 
src/adapter/linux/worker.c create mode 100644 src/adapter/memctx/include/memctx.h create mode 100644 src/adapter/memctx/memctx.c create mode 100644 src/adapter/vmhost/include/vmhost.h create mode 100644 src/adapter/vmhost/vmhost.c create mode 100644 src/cli.c create mode 100644 src/control/inproc.c create mode 100644 src/control/socket.c create mode 100644 src/core/core.c create mode 100644 src/core/include/core_internal.h create mode 100644 src/core/linux/loop.c create mode 100644 src/ctx/ctx.c create mode 100644 src/ctx/include/ctx_internal.h create mode 100644 src/test/test_authz.c create mode 100644 src/test/test_ctx.c create mode 100644 src/test/test_inputobs.c create mode 100644 src/test/test_lease.c create mode 100644 src/test/test_memctx.c create mode 100644 src/test/test_memwrite.c create mode 100644 src/test/test_mvm.c create mode 100644 src/test/test_sec.c create mode 100644 src/test/test_sock.c create mode 100644 src/test/test_vmhost.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..77cc897 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.*/ +cmake-*/ +compile* +Testing/ +CLAUDE.md diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..82ad6cf --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,137 @@ +cmake_minimum_required(VERSION 3.21) # CMAKE_C_STANDARD 17 is only accepted by CMake >= 3.21 +project(vmsig C) + +set(CMAKE_C_STANDARD 17) +set(CMAKE_C_STANDARD_REQUIRED ON) +set(CMAKE_C_EXTENSIONS ON) # epoll/eventfd/timerfd/clock_gettime: gnu ext +option(VMSIG_LTO "Enable LTO" OFF) + +# Link the real sibling libraries (their .a, built with -fPIC). By default the spine +# builds against headers only: the SI calls are hidden behind these flags, and the +# stub mode proves the seam without a real VM. 
+option(VMSIG_WITH_VMIE "Link real vmie (libvmie.a, PIC) for armed memctx" OFF) +option(VMSIG_WITH_VMCTL "Link real vmctl (libvmctl.a, PIC) for armed input" OFF) + +# ---- Sibling library sources (set these to your local checkouts) ------------ +# Only needed for the armed builds below; the default stub build needs neither. +set(LIBVMIE_PATH "" CACHE PATH "Path to the vmie library sources (for VMSIG_WITH_VMIE)") +set(LIBVMCTL_PATH "" CACHE PATH "Path to the vmctl library sources (for VMSIG_WITH_VMCTL)") + +find_package(Threads REQUIRED) + +# ---- signaling library ------------------------------------------------------ +add_library(vmsig SHARED + src/core/core.c + src/core/linux/loop.c + src/ctx/ctx.c + src/adapter/linux/worker.c + src/adapter/memctx/memctx.c + src/adapter/input/input.c + src/adapter/vmhost/vmhost.c + src/control/inproc.c + src/control/socket.c) + +target_include_directories(vmsig + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/core/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/ctx/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/memctx/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/input/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/vmhost/include) + +target_link_libraries(vmsig PRIVATE Threads::Threads) + +# armed: the real vmie as a pre-built PIC .a (IMPORTED) — headers and symbols. +if(VMSIG_WITH_VMIE) + add_library(vmie STATIC IMPORTED) + set_target_properties(vmie PROPERTIES + IMPORTED_LOCATION ${LIBVMIE_PATH}/.build/libvmie.a + INTERFACE_INCLUDE_DIRECTORIES ${LIBVMIE_PATH}/include) + target_link_libraries(vmsig PRIVATE vmie) + target_compile_definitions(vmsig PRIVATE VMSIG_WITH_VMIE) +endif() + +# armed: the real vmctl as a pre-built PIC .a (IMPORTED). 
+if(VMSIG_WITH_VMCTL) + add_library(vmctl STATIC IMPORTED) + set_target_properties(vmctl PROPERTIES + IMPORTED_LOCATION ${LIBVMCTL_PATH}/.build/libvmctl.a + INTERFACE_INCLUDE_DIRECTORIES ${LIBVMCTL_PATH}/include) + target_link_libraries(vmsig PRIVATE vmctl) + target_compile_definitions(vmsig PRIVATE VMSIG_WITH_VMCTL) +endif() + +target_compile_options(vmsig PRIVATE -O2 -Wall -Wextra) +if(VMSIG_LTO) + target_compile_options(vmsig PRIVATE -flto) + target_link_options(vmsig PRIVATE -flto) +endif() + +# ---- demonstrator on top of the library (like vmie_cli / vmctl) ------------- +add_executable(vmsig_cli src/cli.c) +target_link_libraries(vmsig_cli PRIVATE vmsig) +target_compile_options(vmsig_cli PRIVATE -Wall -Wextra) + +# ---- transfer-context tests (ctest) ----------------------------------------- +enable_testing() +add_executable(vmsig_test src/test/test_ctx.c) +target_link_libraries(vmsig_test PRIVATE vmsig) +target_compile_options(vmsig_test PRIVATE -Wall -Wextra) +add_test(NAME ctx COMMAND vmsig_test) + +add_executable(vmsig_sectest src/test/test_sec.c) +target_link_libraries(vmsig_sectest PRIVATE vmsig) +target_compile_options(vmsig_sectest PRIVATE -Wall -Wextra) +add_test(NAME sec COMMAND vmsig_sectest) + +add_executable(vmsig_socktest src/test/test_sock.c) +target_link_libraries(vmsig_socktest PRIVATE vmsig Threads::Threads) +target_compile_options(vmsig_socktest PRIVATE -Wall -Wextra) +add_test(NAME sock COMMAND vmsig_socktest) + +add_executable(vmsig_mvmtest src/test/test_mvm.c) +target_link_libraries(vmsig_mvmtest PRIVATE vmsig) +target_compile_options(vmsig_mvmtest PRIVATE -Wall -Wextra) +add_test(NAME mvm COMMAND vmsig_mvmtest) + +add_executable(vmsig_authztest src/test/test_authz.c) +target_link_libraries(vmsig_authztest PRIVATE vmsig) +target_compile_options(vmsig_authztest PRIVATE -Wall -Wextra) +add_test(NAME authz COMMAND vmsig_authztest) + +add_executable(vmsig_memctxtest src/test/test_memctx.c) +target_link_libraries(vmsig_memctxtest PRIVATE vmsig 
Threads::Threads) +target_include_directories(vmsig_memctxtest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/core/include) +target_compile_options(vmsig_memctxtest PRIVATE -Wall -Wextra) +add_test(NAME memctx COMMAND vmsig_memctxtest) + +add_executable(vmsig_vmhosttest src/test/test_vmhost.c) +target_link_libraries(vmsig_vmhosttest PRIVATE vmsig Threads::Threads) +target_include_directories(vmsig_vmhosttest PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/vmhost/include) +target_compile_options(vmsig_vmhosttest PRIVATE -Wall -Wextra) +add_test(NAME vmhost COMMAND vmsig_vmhosttest) + +add_executable(vmsig_leasetest src/test/test_lease.c) +target_link_libraries(vmsig_leasetest PRIVATE vmsig Threads::Threads) +target_include_directories(vmsig_leasetest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/core/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/ctx/include) +target_compile_options(vmsig_leasetest PRIVATE -Wall -Wextra) +add_test(NAME lease COMMAND vmsig_leasetest) + +add_executable(vmsig_inputobstest src/test/test_inputobs.c) +target_link_libraries(vmsig_inputobstest PRIVATE vmsig Threads::Threads) +target_compile_options(vmsig_inputobstest PRIVATE -Wall -Wextra) +add_test(NAME inputobs COMMAND vmsig_inputobstest) + +add_executable(vmsig_memwritetest src/test/test_memwrite.c) +target_link_libraries(vmsig_memwritetest PRIVATE vmsig Threads::Threads) +target_include_directories(vmsig_memwritetest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/memctx/include) +target_compile_options(vmsig_memwritetest PRIVATE -Wall -Wextra) +add_test(NAME memwrite COMMAND vmsig_memwritetest) + +# the demonstrator doubles as an end-to-end seam test (self-terminates rc=0) +add_test(NAME cli COMMAND vmsig_cli) diff --git a/include/vmsig.h b/include/vmsig.h new file mode 100644 index 0000000..465c461 --- /dev/null +++ b/include/vmsig.h @@ -0,0 +1,21 @@ +#ifndef VMSIG_H +#define VMSIG_H + +/* vmsig.h — umbrella header for the signaling layer of the SISC paradigm. 
+ * + * vmsig binds the three SI repos (sensors vmie/vgpustream + input vmctl) to the control + * (algorithm OR human), bidirectionally translating transfer events. The layer itself + * contains NO sensing, actuation, or decision/behavioral-timing logic. + * + * Application wiring: create the context (vmsig_ctx_new) -> core (vmsig_core_new) -> + * register per-VM adapters (vmsig_core_add_adapter) and control + * (vmsig_core_add_control) -> vmsig_core_run. */ + +#include "vmsig_event.h" /* neutral transfer-event model + payload */ +#include "vmsig_memctx.h" /* address-space context handoff contract (kcr3+locator) */ +#include "vmsig_ctx.h" /* transfer context: priority/seq/protocol timing */ +#include "vmsig_adapter.h" /* unified SI adapter interface + factories */ +#include "vmsig_control.h" /* control-agnostic seam + reference in-proc */ +#include "vmsig_core.h" /* epoll core */ + +#endif /* VMSIG_H */ diff --git a/include/vmsig_adapter.h b/include/vmsig_adapter.h new file mode 100644 index 0000000..55a450b --- /dev/null +++ b/include/vmsig_adapter.h @@ -0,0 +1,81 @@ +#ifndef VMSIG_ADAPTER_H +#define VMSIG_ADAPTER_H +#include "vmsig_event.h" +#include "vmsig_memctx.h" /* vmsig_memctx_reg — address-space context registration seam */ + +/* vmsig_adapter.h — unified SI adapter interface. One vtable, three readiness + * shapes behind it. The adapter is the ONLY place that includes neighbor headers + * (memmodel.h/win32.h/vgpu_stream.h/vmctl.h). It registers 0..N fds with the core; + * the core does not know whether it is a socket, eventfd or timerfd. SI specifics + * never leave these functions. */ + +typedef struct vmsig_adapter vmsig_adapter; /* opaque adapter instance */ + +/* How the adapter expresses readiness. The core treats all three as ordinary + * epoll fds; the enum is documentation + the choice of default epoll flags. 
*/ +typedef enum { + VMSIG_RDY_FD = 0, /* native pollable fd (socket) */ + VMSIG_RDY_TIMERFD = 1, /* timerfd; adapter samples shared memory */ + VMSIG_RDY_EVENTFD = 2 /* worker thread bridges a blocking API -> eventfd */ +} vmsig_readiness; + +/* Sink handed by the core to the adapter for emitting UP events without knowing + * the internals of the context. emit() is thread-safe (also called from worker + * threads); register_memctx/unregister_memctx are called ONLY on the loop thread. + * The registration hooks may be NULL (adapters/tests need not call them). */ +typedef struct { + int (*emit)(void* token, vmsig_event* ev); /* UP (thread-safe) */ + int (*register_memctx)(void* token, const vmsig_memctx_reg* reg); /* loop thread: AS context; 0/-1 */ + void (*unregister_memctx)(void* token, uint32_t endpoint); /* loop thread: context gone */ + void* token; +} vmsig_emit; + +/* One fd contributed by the adapter, with epoll flags and a cookie for demux. */ +typedef struct { + int fd; + uint32_t epoll_events; /* EPOLLIN / EPOLLOUT / ... */ + vmsig_readiness shape; + uint32_t cookie; /* adapter-private fd discriminator */ +} vmsig_fd_reg; + +/* Adapter vtable. Each SI adapter implements this; SI specifics do not leak. */ +typedef struct vmsig_adapter_ops { + const char* name; /* "memctx"/"input"/"vmhost" — diagnostics */ + vmsig_source source; /* neutral seam role */ + uint32_t codec; /* vmsig_codec owned by the adapter */ + + /* Create an instance from opaque cfg (adapter parses it; core passes as-is). + * Returns an instance or NULL. `endpoint` is the id of the VM it binds to. */ + vmsig_adapter* (*open)(const void* cfg, uint32_t endpoint); + + /* Attach: open the SI contract, bring up workers, hand fds into reg[] + * (<=cap), store `emit` for UP. Returns the number of registered fds (>=0) / -1. 
*/ + int (*attach)(vmsig_adapter* a, const vmsig_emit* emit, + vmsig_fd_reg* reg, int cap); + + /* Readiness of one of the adapter's fds: `cookie` identifies the fd, `events` + * are the epoll flags. The adapter does NON-blocking work (reads the socket / + * drains the eventfd / reads the timerfd + samples counters) and calls emit on + * each UP. 0 — ok, -1 — fatal (the core detaches the adapter). */ + int (*on_readiness)(vmsig_adapter* a, uint32_t cookie, uint32_t events); + + /* Consume a DOWN event (a control decision): encode it into the contract + * (vmctl_batch / vmctl power; write the vgpu control block; read request to vmie). + * For blocking sinks it hands the work to a worker and returns immediately; + * completion arrives later as an UP VMSIG_EV_ACT_ACK (keyed by ev->corr). + * 0 — accepted, 1 — rejected (not for this seam), -1 — error. */ + int (*submit)(vmsig_adapter* a, const vmsig_event* ev); + + /* Detach + free: stop workers, close SI handles and fds. */ + void (*close)(vmsig_adapter* a); +} vmsig_adapter_ops; + +/* Factories (defined in each adapter's TU — the only symbol the build/cli layer + * needs; keeps neighbor headers out of the core's include-path). */ +const vmsig_adapter_ops* vmsig_memctx_ops(void); /* vmie: address-space context (kcr3+locator) */ +const vmsig_adapter_ops* vmsig_input_ops(void); /* vmctl */ +const vmsig_adapter_ops* vmsig_vmhost_ops(void); /* QEMU/QMP (its own signaling) */ +/* (vgpu frame sensor is no longer a signaling adapter: vgpu perception lives in an + * out-of-repo S-lib that consumes memctx; see vgpu-perception-handoff.) */ + +#endif /* VMSIG_ADAPTER_H */ diff --git a/include/vmsig_control.h b/include/vmsig_control.h new file mode 100644 index 0000000..548b81e --- /dev/null +++ b/include/vmsig_control.h @@ -0,0 +1,126 @@ +#ifndef VMSIG_CONTROL_H +#define VMSIG_CONTROL_H +#include "vmsig_event.h" + +/* vmsig_control.h — control-agnostic seam. 
Control (an algorithm OR a human) + * attaches via ONE neutral interface: a command queue (down) + an event + * subscription (up). In-process implements the vtable with direct callbacks + * (fd = -1); out-of-process is a socket whose fd is registered with the core like + * any source. The core treats both the same. Orchestration is NOT wired in here — + * only the seam. */ + +/* Subscription filter: which UP events the control WANTS. This is only a + * NARROWING; the real ceiling is set by the grant (effective = sub ∩ grant). */ +typedef struct { + uint32_t source_mask; /* bit (1u< not a valid poller (receives and + * sends nothing). */ +#define VMSIG_CAP_OBSERVE 0x1u /* UP of SEAM/generic coherent state (observation) */ +#define VMSIG_CAP_INPUT 0x2u /* CMD_INPUT */ +/* (0x4 is the freed bit of the removed CAP_STREAM; the future vgpu-control down-path + * returns via write-signaled/MEMWRITE. Do NOT reuse.) */ +#define VMSIG_CAP_LIFECYCLE 0x8u /* CMD_LIFECYCLE safe ones (pause/resume/wakeup) */ +/* (0x10 is the freed bit of the removed CAP_MEMREAD; do NOT reuse: a stale grant + * with this bit must not silently alias to the privileged memory cap.) */ +#define VMSIG_CAP_POWER 0x20u /* destructive lifecycle/VM (powerdown/reset/quit) */ +#define VMSIG_CAP_VM 0x40u /* CMD_VM safe ones (query/cont/stop), VMHOST seam */ +#define VMSIG_CAP_MEMCTX 0x80u /* SUBSCRIPTION to a coherent AS context (UP MEMCTX*, re-share RO-fd). + * NOT an access broker (that is OS-DAC on the fd) — gates RECEIVING the datum. */ +#define VMSIG_CAP_MEMWRITE 0x100u /* CMD_MEMWRITE: atomic write-signaled mutation of shared guest memory + * (separate from the freed CAP_MEMREAD bit — read != write; fresh bit + * avoids stale-grant aliasing to this privileged cap). */ + +typedef struct { + uint32_t principal; /* id for auditing (uid/token) */ + uint64_t endpoint_mask; /* which VMs (bit 1ull< + * PREEMPT, otherwise DENY. 
*/ +typedef enum { + VMSIG_ARB_DENY = 0, /* deny the contender, the owner keeps it */ + VMSIG_ARB_PREEMPT = 1 /* take it from the owner, give it to the contender (QUEUE — reserved) */ +} vmsig_arb_decision; + +/* Called ONLY when (endpoint,class) is held by a LIVE owner (incumbent) and an + * ACQUIRE arrives from another contender. incumbent/contender are the parties' + * grants (live, not copies); incumbent is NEVER NULL (a dead owner is treated as a + * free slot and policy is not called). Called on the loop thread. */ +typedef vmsig_arb_decision (*vmsig_arb_policy)(void* ud, uint32_t endpoint, uint32_t cls, + const vmsig_grant* incumbent, + const vmsig_grant* contender); + +/* Control endpoint vtable. The core calls deliver() for UP; control sends DOWN via + * the emit hook that the core installs in set_emit_down(). */ +typedef struct vmsig_control_ops { + const char* name; + + /* fd for an out-of-process control (socket). -1 => in-process, callbacks only + * (no registration in epoll). */ + int (*fd)(void* ctl); + + /* Declare interest (called once at attach). */ + int (*subscribe)(void* ctl, vmsig_sub* out); + + /* Core -> control: an UP event for the subscriber. For in-process, a direct + * call; for socket-control, serialization onto the wire. Borrowed: whatever + * must outlive the call must be copied. */ + int (*deliver)(void* ctl, const vmsig_event* ev); + + /* Core -> control (socket only): the control-fd is readable; the implementation + * parses the wire into DOWN events and calls the installed down-emit. */ + int (*on_readable)(void* ctl); + + /* The core installs the hook by which control sends DOWN commands; the core + * routes them into vmsig_ctx_submit(ctx, VMSIG_DIR_DOWN, ev). */ + void (*set_emit_down)(void* ctl, int (*emit)(void* token, vmsig_event*), + void* token); + + void (*close)(void* ctl); + + /* Core -> control: deliver a coherent address-space context (UP MEMCTX) + RO-fd + * of the RAM region. 
Socket: a vmsig_wire frame (kind=MEMCTX, inln=vmsig_memctx) + fd in cmsg + * (SCM_RIGHTS); the segs payload does NOT go on the wire (the holder opens + * via `low`). In-proc: direct fd + event (segs in payload, decode with vmsig_memctx_segs). + * The fd is BORROWED for the duration of the call (the core closes it afterwards) — the holder + * dup's/mmap's it to keep it. Optional: NULL => control does not accept MEMCTX. 0/-1. */ + int (*attach_memctx)(void* ctl, const vmsig_event* ev, int fd); +} vmsig_control_ops; + +/* Reference in-process control: a thin shim turning a C callback into a vtable, for + * embedding an algorithm directly. */ +typedef struct { + int (*on_event)(void* user, const vmsig_event* up); /* core -> algorithm */ + void* user; + vmsig_sub sub; /* subscription filter */ + /* Core -> algorithm: a coherent AS context (UP MEMCTX) + RO-fd as a direct int. The fd + * is borrowed (dup/mmap to keep it). NULL => does not accept. 0/-1. */ + int (*on_memctx)(void* user, const vmsig_event* ev, int fd); +} vmsig_inproc_cfg; + +/* Create a reference in-proc control over cfg (which is copied). Returns an opaque + * ctl for vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl). Freed via + * ops->close(ctl). NULL on OOM. */ +const vmsig_control_ops* vmsig_inproc_control_ops(void); +void* vmsig_inproc_control_new(const vmsig_inproc_cfg* cfg); + +/* Send a DOWN command from an in-proc control (after attach). 0 — ok, -1 — error. */ +int vmsig_inproc_send(void* ctl, vmsig_event* down); + +#endif /* VMSIG_CONTROL_H */ diff --git a/include/vmsig_core.h b/include/vmsig_core.h new file mode 100644 index 0000000..01c04a6 --- /dev/null +++ b/include/vmsig_core.h @@ -0,0 +1,70 @@ +#ifndef VMSIG_CORE_H +#define VMSIG_CORE_H +#include "vmsig_event.h" +#include "vmsig_ctx.h" +#include "vmsig_adapter.h" +#include "vmsig_control.h" + +/* vmsig_core.h — non-blocking epoll core. 
It knows a single vocabulary: "here is + * an fd — call the neutral handler on readiness; the handler produces/consumes + * neutral events". All neighbor mechanisms are just different ways to spawn an + * fd. The core structurally cannot name a neighbor's type: neighbor headers are + * visible only from the adapter TUs. */ + +typedef struct vmsig_core vmsig_core; + +/* Create the core over a transfer context (the core does NOT own ctx; ctx's + * lifetime must cover the core). NULL on error. */ +vmsig_core* vmsig_core_new(vmsig_ctx* ctx); + +/* Stop, detach all adapters/control, free. Safe on NULL. */ +void vmsig_core_free(vmsig_core* c); + +/* ===== Audit (observability of admissions/denials) ===== */ +typedef enum { + VMSIG_AUDIT_ADMIT = 0, /* poller admitted (socket accept) */ + VMSIG_AUDIT_REJECT = 1, /* poller rejected at accept (empty grant) */ + VMSIG_AUDIT_DOWN_DENIED = 2, /* DOWN command denied by grant/cap */ + /* --- lease arbitration --- */ + VMSIG_AUDIT_LEASE_GRANTED = 3, /* lease granted/preempted */ + VMSIG_AUDIT_LEASE_DENIED = 4, /* ACQUIRE denied OR destructive dropped by lease gate */ + VMSIG_AUDIT_LEASE_REVOKED = 5, /* lease revoked by preemption */ + VMSIG_AUDIT_LEASE_RECLAIMED = 6, /* lease reclaimed on owner death (reclaim) */ + VMSIG_AUDIT_MEMCTX_GRANTED = 7 /* address-space context granted/replayed to holder */ +} vmsig_audit_kind; + +typedef struct { + vmsig_audit_kind kind; + uint32_t principal; /* uid/token (grant.principal or peer uid) */ + uint32_t endpoint; + uint32_t cmd; /* vmsig_kind for DOWN_DENIED */ + uint32_t detail; /* extra (e.g. peer pid) */ +} vmsig_audit; + +/* Set the audit callback (NULL = off). Called on the loop thread. */ +void vmsig_core_set_audit(vmsig_core* c, + void (*cb)(void* ud, const vmsig_audit* a), void* ud); + +/* Set the lease arbitration policy (NULL => default: contender.arb_prio > + * incumbent.arb_prio ? PREEMPT : DENY). Called on the loop thread. 
*/ +void vmsig_core_set_arb_policy(vmsig_core* c, vmsig_arb_policy cb, void* ud); + +/* Register an adapter for VM `endpoint`: open(cfg,endpoint) -> attach(...), + * enroll each yielded fd into epoll and into the dispatch table fd->(adapter,cookie). + * Returns the adapter id (>=0) or -1. */ +int vmsig_core_add_adapter(vmsig_core* c, const vmsig_adapter_ops* ops, + const void* cfg, uint32_t endpoint); + +/* Attach a control endpoint (in-process or socket) with a GRANT (capability set). + * grant == NULL => default-deny (poller inert). The core sees only the neutral + * vtable + grant + (opt.) fd. Returns the control id (>=0) or -1. */ +int vmsig_core_add_control(vmsig_core* c, const vmsig_control_ops* ops, void* ctl, + const vmsig_grant* grant); + +/* Spin the loop until a stop is requested. 0 — clean, -1 — fatal. */ +int vmsig_core_run(vmsig_core* c); + +/* Asynchronous, signal-safe stop request: writes the wakeup eventfd. */ +void vmsig_core_stop(vmsig_core* c); + +#endif /* VMSIG_CORE_H */ diff --git a/include/vmsig_ctx.h b/include/vmsig_ctx.h new file mode 100644 index 0000000..1bcc32d --- /dev/null +++ b/include/vmsig_ctx.h @@ -0,0 +1,48 @@ +#ifndef VMSIG_CTX_H +#define VMSIG_CTX_H +#include "vmsig_event.h" + +/* vmsig_ctx.h — the "transfer context": the SISC-critical seam owning PRIORITY, + * SEQUENCING and PROTOCOL timing of delivery. Behavioral timing does NOT belong + * here — commands arrive already decided from control; the context merely + * orders and paces them on the "wire". */ + +typedef struct vmsig_ctx vmsig_ctx; /* opaque: queues, seq, timing */ + +/* Protocol (RS232-like) transmission timings — transport ONLY, not behavior. + * All zeros = pass-through (no pacing). */ +typedef struct { + uint32_t min_gap_ns; /* min. 
gap between channel events (rate-cap) */ + uint32_t coalesce_ns; /* collapse bursts of one kind within a window */ + uint32_t max_inflight; /* backpressure depth on a channel before drop */ + uint8_t drop_policy; /* VMSIG_DROP_* */ +} vmsig_timing; + +#define VMSIG_DROP_OLDEST 0 +#define VMSIG_DROP_NEWEST 1 +#define VMSIG_DROP_BLOCK 2 + +vmsig_ctx* vmsig_ctx_new(void); +void vmsig_ctx_free(vmsig_ctx* c); + +/* Policy per (source,dir): default priority + protocol timing. They live + * here, NOT in adapters and NOT in control. */ +int vmsig_ctx_set_policy(vmsig_ctx* c, vmsig_source src, vmsig_dir dir, + vmsig_prio default_prio, const vmsig_timing* t); + +/* Enqueue an event into the `dir`-direction context (assigns seq, applies + * priority/timing/coalescing/backpressure). 0 — enqueued, 1 — + * coalesced/dropped by policy, -1 — error. On success takes ownership of + * ev->payload. Thread-safe (the UP side is called from worker threads). */ +int vmsig_ctx_submit(vmsig_ctx* c, vmsig_dir dir, vmsig_event* ev); + +/* Fetch the next event of direction `dir` ready for delivery, honoring + * priority + protocol timing. 1 — event written to out, 0 — nothing yet + * (caller arms timing_fd), -1 — error. */ +int vmsig_ctx_next(vmsig_ctx* c, vmsig_dir dir, vmsig_event* out); + +/* timerfd by which the context wakes the loop when a paced/coalesced event + * has matured. Registered in the core like any source. -1 if not needed. */ +int vmsig_ctx_timing_fd(vmsig_ctx* c, vmsig_dir dir); + +#endif /* VMSIG_CTX_H */ diff --git a/include/vmsig_event.h b/include/vmsig_event.h new file mode 100644 index 0000000..38aa0b8 --- /dev/null +++ b/include/vmsig_event.h @@ -0,0 +1,280 @@ +#ifndef VMSIG_EVENT_H +#define VMSIG_EVENT_H +#include <stdint.h> +#include <stddef.h> + +/* vmsig_event.h — neutral "transfer event" + "payload" model. + * + * This is the ONLY type that crosses the signaling core. 
The taxonomy names the + * transfer SEMANTICS, not neighbor types: a TU compiled against this header + * cannot name vmctl_batch, vgpu_producer_t, or vmie_mem. The SI data body lives + * in an opaque payload owned by the source adapter's codec; the core does NOT + * dereference it — it only routes the event and carries the payload. */ + +/* Transfer direction relative to control. */ +typedef enum { + VMSIG_DIR_UP = 0, /* sensor/state -> control */ + VMSIG_DIR_DOWN = 1 /* control decision -> actuation/SI */ +} vmsig_dir; + +/* Logical seam (SI role) the event crosses. NEUTRAL roles, not driver names: + * assigned at adapter registration, used only for routing, the priority default, + * and the subscription filter. */ +typedef enum { + VMSIG_SRC_NONE = 0, + VMSIG_SRC_FRAME = 1, /* vgpu desktop sensor role; reserved: no signaling adapter, + * the future vgpu-perception shell-as-control carries it (CURSOR_STATE) */ + VMSIG_SRC_INPUT = 2, /* input/actuation + lifecycle (vmctl role) */ + VMSIG_SRC_CONTROL = 3, /* originated by a control endpoint */ + VMSIG_SRC_CORE = 4, /* core-internal (shutdown/error/tick) */ + VMSIG_SRC_VMHOST = 5, /* VM substrate / QEMU: lifecycle + events (own QMP) */ + VMSIG_SRC_MEMCTX = 6, /* coherent guest address-space context (kcr3+locator) */ + VMSIG_SRC_MAX +} vmsig_source; + +/* Delivery priority class. Higher value — earlier delivery. This is NOT a + * behavioral timing but ordering on the "wire". The default is assigned per + * source at registration; the emitter may override it per event. */ +typedef enum { + VMSIG_PRIO_BULK = 0, /* frames, large state deltas */ + VMSIG_PRIO_NORMAL = 1, /* routine ack/samples */ + VMSIG_PRIO_HIGH = 2, /* input commands (latency-sensitive) */ + VMSIG_PRIO_URGENT = 3, /* lifecycle, seam-down, errors */ + VMSIG_PRIO_MAX +} vmsig_prio; + +/* NEUTRAL event taxonomy: each kind is a transfer MEANING that exactly one + * adapter codec decodes from / encodes into its contract. 
/* NEUTRAL event taxonomy: each kind is a transfer MEANING that exactly one
 * adapter codec decodes from / encodes into its contract. The core routes by
 * kind + source + dir + prio and does not interpret the payload.
 *
 * Numbering is part of the wire contract: values are explicit, gaps are
 * deliberate (retired kinds), and a retired number is never handed out again. */
typedef enum {
    /* --- generic / lifecycle (any seam) --- */
    VMSIG_EV_NONE          = 0,
    VMSIG_EV_SEAM_UP       = 1,   /* SI seam came up (attach/bootstrap ok) */
    VMSIG_EV_SEAM_DOWN     = 2,   /* seam lost (heartbeat stale, socket closed) */
    VMSIG_EV_ERROR         = 3,   /* adapter/core error, details in payload */

    /* (16..19 — retired STATE_* of the MEMSTATE seam; do NOT reuse numbers: on a
     * version skew an old STATE kind must not alias a new kind on the wire.) */

    /* (32..36 — retired FRAME_READY/FRAME_STATE/BULK_ATTACHED/BULK_READY/BULK_DETACHED of
     * the removed FRAME adapter + bulk data-plane (vgpu perception moved to an S-lib);
     * do NOT reuse numbers — wire-skew safety.) */

    /* --- UP: cursor (vgpu sensor; emitted by the vgpu-perception shell-as-control) --- */
    VMSIG_EV_CURSOR_STATE  = 37,  /* cursor position/visibility; inln=vmsig_cursor; cap OBSERVE|INPUT */

    /* --- UP: input/lifecycle ack (INPUT seam) --- */
    VMSIG_EV_ACT_ACK       = 48,  /* down-command completed (ok/err) */
    VMSIG_EV_VM_LIFECYCLE  = 49,  /* power/lifecycle state report */

    /* --- UP: lease arbitration (all addressed, origin=initiator; source=CORE) --- */
    VMSIG_EV_LEASE_GRANTED = 50,  /* lease granted (CMD_ACQUIRE succeeded) */
    VMSIG_EV_LEASE_DENIED  = 51,  /* lease denied (reason in vmsig_lease_req) */
    VMSIG_EV_LEASE_RELEASED= 52,  /* lease released by owner (CMD_RELEASE) */
    VMSIG_EV_LEASE_REVOKED = 53,  /* lease taken away by preemption/death */
    VMSIG_EV_LEASE_STATUS  = 54,  /* response to CMD_LEASE_STATUS (vmsig_lease_status) */

    /* --- UP: response to a held-input query (INPUT seam, addressed to initiator) --- */
    VMSIG_EV_INPUT_HELD    = 55,  /* set of held KEY/BTN from the vmctl record; inln=vmsig_input_held */

    /* --- DOWN: control decisions --- */
    VMSIG_EV_CMD_INPUT     = 64,  /* input injection (abs/rel/btn/key/scroll) */
    VMSIG_EV_CMD_LIFECYCLE = 65,  /* powerdown/reset/wakeup/pause/resume */
    /* (66 — retired CMD_STREAM of the removed FRAME adapter; the future vgpu-control
     * down-path returns via write-signaled/MEMWRITE. 67..69 — retired
     * CMD_QUERY/WATCH/UNWATCH; do NOT reuse numbers.) */
    VMSIG_EV_CMD_VM        = 70,  /* base VM control (vmsig_vm_cmd; VMHOST seam) */
    /* (71..72 — retired CMD_SUBSCRIBE_BULK/UNSUBSCRIBE_BULK of the bulk data-plane;
     * do NOT reuse numbers.) */

    /* --- DOWN: lease arbitration (intercepted by the core, not forwarded to the adapter) --- */
    VMSIG_EV_CMD_ACQUIRE      = 73,  /* request an exclusive lease of a class: inln=vmsig_lease_req */
    VMSIG_EV_CMD_RELEASE      = 74,  /* release your own lease of a class: inln=vmsig_lease_req */
    VMSIG_EV_CMD_LEASE_STATUS = 75,  /* query lease status of a class: inln=vmsig_lease_req */
    VMSIG_EV_CMD_QUERY_INPUT  = 76,  /* query held KEY/BTN (from the vmctl record); reply UP INPUT_HELD; cap INPUT */

    /* --- UP: address-space context (MEMCTX seam; coherent kcr3+locator datum) --- */
    VMSIG_EV_MEMCTX             = 77,  /* context multicast/replay: inln=vmsig_memctx,
                                        * payload=vmsig_memseg[] (owned), RO-fd alongside */
    VMSIG_EV_MEMCTX_INVALIDATED = 78,  /* epoch invalidation: inln=vmsig_memctx_inv (URGENT) */

    /* --- DOWN: coherent memory write (write-signaled; MEMCTX seam) --- */
    VMSIG_EV_CMD_MEMWRITE  = 79,  /* atomic gva_write under the held lease; inln=vmsig_memwrite (+tail/payload bytes);
                                   * cap MEMWRITE + lease MEMWRITE + extent. ACK via ACT_ACK{ok,corr}. */
    VMSIG_EV_KIND_MAX             /* sentinel: one past the highest assigned kind (currently 80);
                                   * add new kinds directly above this line with an explicit number */
} vmsig_kind;
/* Lease denial reason (vmsig_lease_req.reason in UP LEASE_DENIED).
 * The values travel in the inline body of the reply, so they are explicit and
 * must stay stable. */
enum {
    VMSIG_LEASE_DENY_HELD       = 0,  /* held by an equal/higher; the owner holds it */
    VMSIG_LEASE_DENY_NOCAP      = 1,  /* no cap for the class (CAP_INPUT/CAP_POWER/CAP_MEMWRITE,
                                       * matching the active INPUT/POWER/MEMWRITE classes) */
    VMSIG_LEASE_DENY_NOGRANT    = 2,  /* endpoint outside the grant (endpoint_mask) */
    VMSIG_LEASE_DENY_BADCLASS   = 3,  /* class out of range */
    VMSIG_LEASE_DENY_LOWER_PRIO = 4   /* contender priority not above the owner's */
    /* NOTE(review): HELD and LOWER_PRIO both describe "owner wins on priority";
     * confirm against the core's arbitration which one is emitted in which case. */
};
/* One neutral input event, carried by value in vmsig_event.inln for
 * DOWN VMSIG_EV_CMD_INPUT. `kind` selects which of the remaining fields are
 * meaningful; the input adapter copies the struct out of inln and translates
 * it to its driver contract. */
typedef struct {
    uint32_t kind;      /* vmsig_input_kind */
    int32_t  code;      /* axis / button / evdev code (neutral event code) */
    int32_t  value;     /* abs coordinate / rel delta / pressed(1)|released(0) */
    double   scroll;    /* scroll magnitude (VMSIG_INPUT_SCROLL only) */
} vmsig_input;          /* fits in vmsig_event.inln[48] */
/* NOTE(review): on ABIs that align double to 8 there are 4 padding bytes between
 * `value` and `scroll`; encoders should zero the whole struct before filling it so
 * the padding that lands in inln (and on the socket wire) is deterministic. */
#define VMSIG_MEMWRITE_INLINE 32u   /* inln tail capacity for SRC (48 - 16 header) */
#define VMSIG_MW_SRC_INLINE   0x1u  /* SRC bytes are in inln tail (len<=INLINE) */
#define VMSIG_MW_SRC_PAYLOAD  0x2u  /* SRC bytes are in ev->payload.data (in-proc) */

/* Header of a DOWN VMSIG_EV_CMD_MEMWRITE request. It sits at the start of
 * vmsig_event.inln; when VMSIG_MW_SRC_INLINE is set the source bytes follow it
 * immediately in the same inline zone. */
typedef struct {
    uint64_t gva;       /* guest virtual address to write (resolved under the adapter's kcr3) */
    uint32_t len;       /* number of bytes to write (1..VMSIG_MEMWRITE_MAX) */
    uint32_t flags;     /* VMSIG_MW_SRC_INLINE | VMSIG_MW_SRC_PAYLOAD */
    /* inline SRC tail (when VMSIG_MW_SRC_INLINE): up to VMSIG_MEMWRITE_INLINE bytes follow */
} vmsig_memwrite;       /* header = 8+4+4 = 16 bytes; +32 tail = 48 (exactly inln[48]) */

/* Compile-time layout guard: header + inline tail must fill inln[48] exactly.
 * A negative array size breaks the build if a field is added or the tail
 * capacity changes without the other side following (pre-C11 compatible form). */
typedef char vmsig_memwrite_layout_check[
    (sizeof(vmsig_memwrite) + VMSIG_MEMWRITE_INLINE == 48u) ? 1 : -1];
#define VMSIG_INPUT_HELD_TRUNC 0x1u

/* Reply body of UP VMSIG_EV_INPUT_HELD (answer to CMD_QUERY_INPUT): the set of
 * currently held keys/buttons as reported by the actuator. The struct is sized
 * to occupy the whole inline zone, so at most 10 entries fit; when more are
 * held, the first 10 are reported and VMSIG_INPUT_HELD_TRUNC is set. */
typedef struct {
    uint32_t count;     /* number of valid entries in ent[] */
    uint32_t flags;     /* VMSIG_INPUT_HELD_TRUNC if more held than capacity */
    struct { uint16_t kind; uint16_t code; } ent[10];  /* kind=VMSIG_INPUT_KEY/BTN; code */
} vmsig_input_held;     /* 4+4+10*4 = 48 (exactly inln[48]) */

/* Compile-time layout guard: the reply must match inln[48] byte for byte.
 * A negative array size fails the build on any layout drift (pre-C11 compatible). */
typedef char vmsig_input_held_layout_check[
    (sizeof(vmsig_input_held) == 48u) ? 1 : -1];
*/ +typedef struct vmsig_payload { + void* data; /* opaque body, codec-defined */ + size_t len; /* bytes in data (0 if borrowed) */ + uint32_t codec; /* vmsig_codec: whose payload it is */ + uint32_t flags; /* VMSIG_PL_* */ + void (*release)(struct vmsig_payload*); /* idempotent; may be NULL */ + void* owner; /* codec context for release() */ +} vmsig_payload; + +/* TRANSFER EVENT. Fixed-size header + a small inline zone; large bodies hang off + * the payload. */ +typedef struct vmsig_event { + vmsig_kind kind; + vmsig_source source; /* source seam */ + vmsig_dir dir; + vmsig_prio prio; + uint32_t endpoint; /* VM/endpoint id — multi-VM-ready */ + uint32_t seq; /* monotonic sequence (set by the context) */ + uint32_t corr; /* correlation: links an ACK to its CMD */ + uint32_t origin; /* INTERNAL: id+1 of the control that initiated DOWN (0=none/broadcast). */ + /* Set by the core in emit_down; NOT serialized onto the wire */ + /* (a poller cannot forge it). Addressed reply delivery. */ + uint64_t ts_ns; /* CLOCK_MONOTONIC at emit time */ + vmsig_payload payload; /* opaque body (may be empty) */ + uint8_t inln[48]; /* inline zone for small events (VMSIG_PL_INLINE) */ +} vmsig_event; + +/* Release the event's payload (if it has release and is not yet freed). Idempotent. */ +static inline void vmsig_payload_release(vmsig_event* ev) { + if (ev && ev->payload.release) { + ev->payload.release(&ev->payload); + ev->payload.release = NULL; + } +} + +#endif /* VMSIG_EVENT_H */ diff --git a/include/vmsig_memctx.h b/include/vmsig_memctx.h new file mode 100644 index 0000000..5b07985 --- /dev/null +++ b/include/vmsig_memctx.h @@ -0,0 +1,101 @@ +#ifndef VMSIG_MEMCTX_H +#define VMSIG_MEMCTX_H +#include +#include +#include "vmsig_event.h" + +/* vmsig_memctx.h — NEUTRAL handoff contract for the guest address-space context. + * + * signaling is a COHERENCE layer for shared state, not perception. 
/* Locator-POD flag: the region is vended read-only (always set by signaling). */
#define VMSIG_MEMCTX_RDONLY 0x1u

/* Address-space context locator-POD (rides in vmsig_event.inln; <=48 bytes).
 * Flat self-describing encoding: nseg explicit, no offset magic. */
typedef struct {
    uint64_t kcr3;      /* permanent System DirectoryTableBase (guest AS root) */
    uint64_t low;       /* below-4G RAM size (PCI-hole split point; single-low open) */
    uint32_t epoch;     /* VM-session epoch; kcr3 valid ONLY within it */
    uint32_t nseg;      /* number of segments in the owned-payload (0 => single-low by `low`) */
    uint32_t flags;     /* VMSIG_MEMCTX_RDONLY */
    uint32_t _pad;      /* explicit padding: keeps the struct free of implicit tail padding */
} vmsig_memctx;         /* 8+8+4+4+4+4 = 32 bytes */

/* One GPA->file segment (mirrors the neighbor's gpa_seg from memmodel.h, but self-contained:
 * this header does NOT pull in the neighbor's contract). Rides in the owned-payload of the
 * MEMCTX event when nseg>0. For a single-low image nseg==0 and the holder opens by `low`. */
typedef struct {
    uint64_t gpa;       /* GPA of the window */
    uint64_t len;       /* window length in bytes */
    uint64_t file_off;  /* offset into the RAM-backing file */
} vmsig_memseg;

/* Epoch invalidation (UP VMSIG_EV_MEMCTX_INVALIDATED, in inln). */
typedef struct {
    uint32_t endpoint;
    uint32_t epoch;     /* new epoch; the previous one's context is invalid */
} vmsig_memctx_inv;

/* Compile-time layout guards for the PODs that ride in inln[48] / the segment
 * payload. A negative array size fails the build if a layout drifts from the
 * documented sizes (pre-C11 compatible form of a static assertion). */
typedef char vmsig_memctx_layout_check[(sizeof(vmsig_memctx) == 32u) ? 1 : -1];
typedef char vmsig_memseg_layout_check[(sizeof(vmsig_memseg) == 24u) ? 1 : -1];
typedef char vmsig_memctx_inv_layout_check[(sizeof(vmsig_memctx_inv) <= 48u) ? 1 : -1];
*/ +static inline const vmsig_memseg* vmsig_memctx_segs(const vmsig_event* ev, + uint32_t* out_nseg) { + const vmsig_memctx* m = (const vmsig_memctx*)ev->inln; + uint32_t n = m->nseg; + if (!n || !ev->payload.data || + ev->payload.len < (size_t)n * sizeof(vmsig_memseg)) { + if (out_nseg) *out_nseg = 0; + return NULL; + } + if (out_nseg) *out_nseg = n; + return (const vmsig_memseg*)ev->payload.data; +} + +/* ===== Registration seam adapter -> core ===== + * + * The memctx adapter registers THIS in the core via vmsig_emit.register_memctx. The core + * keeps the registration per-endpoint (retained-context) and does NOT store a copy of the + * locator: on delivery/replay it calls describe() (current locator snapshot) + share_fd() + * (fresh O_RDONLY fd). The epoch is stamped by the CORE (single source of truth); describe + * does NOT fill it. invalidate() — the core asks the adapter to re-bootstrap on an epoch + * change (the adapter re-emits MEMCTX once ready). All callbacks are called on the loop + * thread. ctx — the adapter's private context. */ +typedef struct vmsig_memctx_reg { + uint32_t endpoint; + uint32_t source; /* VMSIG_SRC_MEMCTX */ + void* ctx; /* adapter's private context */ + /* Current locator snapshot: kcr3/low/nseg/flags + segs (borrowed, owned by the + * adapter; lives across epochs). The core overwrites epoch with its own value. 
/* Wire format: fixed-size, pointer-free — the same contract on the external
 * poller. Single host (unix socket) => native byte order. Only the event's
 * inline part is serialized (payload pointers do not go on the wire). */
#define VMSIG_WIRE_MAGIC   0x47495356u   /* 'VSIG' */
#define VMSIG_WIRE_VERSION 1u

/* One frame on the control socket. Every field is a fixed-width integer, so the
 * frame has no implicit padding; seq, origin, ts_ns and the payload of
 * vmsig_event have no counterpart here and are not transmitted. */
typedef struct {
    uint32_t magic;      /* VMSIG_WIRE_MAGIC */
    uint32_t version;    /* VMSIG_WIRE_VERSION */
    uint32_t kind;       /* vmsig_kind */
    uint32_t source;     /* vmsig_source */
    uint32_t dir;        /* vmsig_dir */
    uint32_t prio;       /* vmsig_prio */
    uint32_t endpoint;
    uint32_t corr;
    uint8_t  inln[48];   /* inline event payload */
} vmsig_wire;

/* Compile-time guard on the frame size (8 * 4 + 48 = 80 bytes): both ends read
 * and write whole frames, so any layout drift must fail the build, not the
 * peer. The negative-array form works without C11 _Static_assert. */
typedef char vmsig_wire_layout_check[(sizeof(vmsig_wire) == 80u) ? 1 : -1];
/* Callback run IN the worker thread: req -> res (both POD <= VMSIG_WORK_SLOT).
 * Returns 0/-1 (the code is stored alongside, see vmsig_worker_poll). Must not touch
 * core structures — only compute res from req.
 *
 * user - the pointer given to vmsig_worker_new, passed through unchanged.
 * req  - the worker's private copy of the submitted request, held in a
 *        VMSIG_WORK_SLOT-byte slot; bytes past the submitted length are zero.
 * res  - a VMSIG_WORK_SLOT-byte slot that is zero-filled before the call; the
 *        whole slot is what vmsig_worker_poll later copies out (truncated to
 *        the caller's capacity). */
typedef int (*vmsig_work_fn)(void* user, const void* req, void* res);
/* Create a worker pool over a shared request queue. nthreads < 1 is treated as 1;
 * if only some threads can be started the pool runs with those, and creation
 * fails (NULL) only when none start or a resource cannot be allocated.
 * max_depth <= 0 selects the default queue cap (512). */
vmsig_worker* vmsig_worker_new(vmsig_work_fn fn, void* user, int nthreads, int max_depth);

/* Stop the threads (join) and free. Safe on NULL. Requests still queued at this
 * point are executed before the threads exit (they are not dropped); results
 * that were never collected with vmsig_worker_poll are discarded. */
void vmsig_worker_free(vmsig_worker* w);

/* completion eventfd: the adapter registers it as a VMSIG_RDY_EVENTFD source.
 * Returns -1 for a NULL worker. The fd stays owned by the worker. */
int vmsig_worker_evfd(const vmsig_worker* w);

/* loop thread: post a request (copied, len <= VMSIG_WORK_SLOT). 0/-1.
 * -1 covers: NULL worker, oversized request, queue at its depth cap, or
 * allocation failure. */
int vmsig_worker_submit(vmsig_worker* w, const void* req, size_t len);

/* loop thread (in on_readiness): drain the completion eventfd. Does not consume
 * results; follow it with vmsig_worker_poll. */
void vmsig_worker_ack(vmsig_worker* w);

/* loop thread: collect a ready result. 1 — written to res (+ *rc = fn code),
 * 0 — empty, -1 — error (NULL worker). Drain in a loop until 0. At most `cap`
 * bytes are copied; res and rc may each be NULL to skip that output. */
int vmsig_worker_poll(vmsig_worker* w, void* res, size_t cap, int* rc);
/* POD request/result of the worker. Both are copied by value through the worker
 * slots, so they must stay plain data no larger than VMSIG_WORK_SLOT. */
typedef struct {
    int      cmd;       /* 0 = input event, 1 = lifecycle */
    uint32_t corr;      /* copied from the DOWN event; echoed back in the ACK */
    uint32_t origin;    /* initiator (addressed ACK) */
    int      kind;      /* vmsig_input_kind (for cmd==0) */
    int      code;      /* axis/btn/evdev-code */
    int      value;     /* abs/rel/down */
    double   scroll;    /* scroll magnitude (VMSIG_INPUT_SCROLL) */
    int      life_op;   /* VMSIG_LIFE_* (powerdown/reset/wakeup/pause/resume) */
} input_req;

/* Result of one job. The struct is copied as-is into the inline zone of the
 * UP ACT_ACK event, so its layout is what a control sees in inln. */
typedef struct { int ok; uint32_t corr; uint32_t origin; } input_res;
*/ +struct vmsig_adapter { + uint32_t endpoint; + int stub; + vmsig_emit emit; + vmsig_worker* worker; + int driver; /* 0=QMP, 1=UINPUT (VMCTL_DRIVER_*); carried open->attach */ + const char* qmp_path; /* borrowed from cfg (valid through attach) */ + const char* input_bus; + int ptr_mode; +#ifdef VMSIG_WITH_VMCTL + vmctl_t* vmctl; +#endif +}; + +static int input_job(void* user, const void* reqp, void* resp) { + struct vmsig_adapter* a = user; + const input_req* rq = reqp; + input_res* rs = resp; + memset(rs, 0, sizeof *rs); + rs->corr = rq->corr; + rs->origin = rq->origin; +#ifdef VMSIG_WITH_VMCTL + if (a->vmctl) { + int r = -1; + if (rq->cmd == 0) { + vmctl_batch b; vmctl_batch_init(&b); + switch (rq->kind) { + case VMSIG_INPUT_ABS: vmctl_batch_abs(&b, rq->code, rq->value); break; + case VMSIG_INPUT_REL: vmctl_batch_rel(&b, rq->code, rq->value); break; + case VMSIG_INPUT_BTN: vmctl_batch_btn(&b, rq->code, rq->value); break; + case VMSIG_INPUT_KEY: vmctl_batch_key(&b, rq->code, rq->value); break; + case VMSIG_INPUT_SCROLL: vmctl_batch_scroll(&b, rq->code, rq->scroll); break; + default: break; + } + r = vmctl_batch_send(a->vmctl, &b); + } else { + switch (rq->life_op) { + case 0: r = vmctl_powerdown(a->vmctl); break; + case 1: r = vmctl_reset(a->vmctl); break; + case 2: r = vmctl_wakeup(a->vmctl); break; + case 3: r = vmctl_pause(a->vmctl); break; + case 4: r = vmctl_resume(a->vmctl); break; + default: break; + } + } + rs->ok = (r == 0); + return r; + } +#endif + (void)a; + rs->ok = 1; /* stub: ack without actuation */ + return 0; +} + +static vmsig_adapter* in_open(const void* cfg, uint32_t endpoint) { + const vmsig_input_cfg* c = cfg; + struct vmsig_adapter* a = calloc(1, sizeof *a); + if (!a) return NULL; + a->endpoint = endpoint; + a->stub = c ? 
c->stub : 1; + if (c) { /* carry the driver selection to attach (cfg not passed there) */ + a->driver = c->driver; + a->qmp_path = c->qmp_path; + a->input_bus = c->input_bus; + a->ptr_mode = c->ptr_mode; + } + return a; +} + +static int in_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg, int cap) { + if (cap < 1) return -1; + a->emit = *emit; + a->worker = vmsig_worker_new(input_job, a, 1, 64); /* QMP is a serial channel, cap 64 */ + if (!a->worker) return -1; + +#ifdef VMSIG_WITH_VMCTL + if (!a->stub) { + /* armed: build vmctl_config from the carried cfg and open the actuator. UINPUT + * (host uinput + optional virtio-input-host-pci passthrough via QMP) is the primary + * input driver; QMP input-send-event is the fallback. */ + vmctl_config vcfg; + memset(&vcfg, 0, sizeof vcfg); + vcfg.driver = (a->driver == 1) ? VMCTL_DRIVER_UINPUT : VMCTL_DRIVER_QMP; + vcfg.qmp_path = a->qmp_path; + vcfg.input_bus = a->input_bus; + vcfg.ptr_mode = a->ptr_mode; + vcfg.uinput_id = NULL; /* built-in HID identity defaults */ + a->vmctl = vmctl_open(&vcfg); + if (!a->vmctl) { vmsig_worker_free(a->worker); a->worker = NULL; return -1; } + } +#endif + + reg[0].fd = vmsig_worker_evfd(a->worker); + reg[0].epoll_events = EPOLLIN; + reg[0].shape = VMSIG_RDY_EVENTFD; + reg[0].cookie = 0; + + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_SEAM_UP; up.source = VMSIG_SRC_INPUT; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; + a->emit.emit(a->emit.token, &up); + return 1; +} + +static int in_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) { + (void)cookie; (void)events; + vmsig_worker_ack(a->worker); + input_res rs; int rc; + while (vmsig_worker_poll(a->worker, &rs, sizeof rs, &rc) == 1) { + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_ACT_ACK; up.source = VMSIG_SRC_INPUT; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; + up.corr = rs.corr; up.origin = 
rs.origin; + up.payload.flags = VMSIG_PL_INLINE; + memcpy(up.inln, &rs, sizeof up.inln < sizeof rs ? sizeof up.inln : sizeof rs); + a->emit.emit(a->emit.token, &up); + } + return 0; +} + +static int in_submit(vmsig_adapter* a, const vmsig_event* ev) { + if (ev->kind == VMSIG_EV_CMD_QUERY_INPUT) { + /* Return what is PRESSED from the vmctl ACTUATOR's record (signaling does NOT track + * held itself). The read is read-only (no QMP round-trip) => on the loop thread; + * addressed reply to the initiator. stub without vmctl => empty set (nothing to + * actuate — nothing to hold). */ + vmsig_input_held h; + memset(&h, 0, sizeof h); +#ifdef VMSIG_WITH_VMCTL + if (a->vmctl) { + const uint32_t capn = (uint32_t)(sizeof h.ent / sizeof h.ent[0]); + unsigned char bits[VMCTL_KEYS_SNAPSHOT_BYTES]; + int n = vmctl_keys_snapshot(a->vmctl, bits, sizeof bits); + for (int code = 0; n > 0 && code <= VMCTL_KEY_CODE_MAX; code++) + if (bits[code >> 3] & (1u << (code & 7))) { + if (h.count < capn) { h.ent[h.count].kind = VMSIG_INPUT_KEY; + h.ent[h.count].code = (uint16_t)code; h.count++; } + else h.flags |= VMSIG_INPUT_HELD_TRUNC; + } + unsigned bm = vmctl_btns_snapshot(a->vmctl); + for (int b = 0; b < 8; b++) if (bm & (1u << b)) { + if (h.count < capn) { h.ent[h.count].kind = VMSIG_INPUT_BTN; + h.ent[h.count].code = (uint16_t)b; h.count++; } + else h.flags |= VMSIG_INPUT_HELD_TRUNC; + } + } +#endif + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_INPUT_HELD; up.source = VMSIG_SRC_INPUT; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; up.origin = ev->origin; + up.payload.flags = VMSIG_PL_INLINE; + memcpy(up.inln, &h, sizeof up.inln < sizeof h ? sizeof up.inln : sizeof h); + a->emit.emit(a->emit.token, &up); + return 0; + } + + input_req rq; + memset(&rq, 0, sizeof rq); + rq.corr = ev->corr; rq.origin = ev->origin; + if (ev->kind == VMSIG_EV_CMD_INPUT) { + rq.cmd = 0; + /* Decode the NEUTRAL public input contract from inln (vmsig_input). 
We do NOT track + * held — that is the vmctl actuator's record (returned via CMD_QUERY_INPUT). */ + vmsig_input in; + memcpy(&in, ev->inln, sizeof in <= sizeof ev->inln ? sizeof in : sizeof ev->inln); + rq.kind = (int)in.kind; + rq.code = (int)in.code; + rq.value = (int)in.value; + rq.scroll = in.scroll; + } else if (ev->kind == VMSIG_EV_CMD_LIFECYCLE) { + rq.cmd = 1; + rq.life_op = (int)(unsigned char)ev->inln[0]; + } else { + return 1; /* not for this seam */ + } + return vmsig_worker_submit(a->worker, &rq, sizeof rq) == 0 ? 0 : -1; +} + +static void in_close(vmsig_adapter* a) { + if (!a) return; + vmsig_worker_free(a->worker); +#ifdef VMSIG_WITH_VMCTL + if (a->vmctl) vmctl_close(a->vmctl); +#endif + free(a); +} + +static const vmsig_adapter_ops IN_OPS = { + .name = "input", .source = VMSIG_SRC_INPUT, .codec = VMSIG_CODEC_INPUT, + .open = in_open, .attach = in_attach, .on_readiness = in_on_ready, + .submit = in_submit, .close = in_close +}; + +const vmsig_adapter_ops* vmsig_input_ops(void) { return &IN_OPS; } diff --git a/src/adapter/linux/worker.c b/src/adapter/linux/worker.c new file mode 100644 index 0000000..557185f --- /dev/null +++ b/src/adapter/linux/worker.c @@ -0,0 +1,162 @@ +/* worker.c — bridge "blocking API -> completion eventfd" (pool of N threads). + * MPSC request/result queues under a mutex + condvar; result readiness is + * signaled via eventfd, on which the core's epoll loop wakes. N threads share one + * request queue (for vmie — parallel read-only readers; for QMP — N=1). 
*/ +#include "adapter_util.h" +#include +#include +#include +#include +#include +#include + +typedef struct work_node { + struct work_node* next; + int rc; /* fn return code (for results) */ + size_t len; + unsigned char buf[VMSIG_WORK_SLOT]; +} work_node; + +typedef struct { work_node* head; work_node* tail; } work_q; + +struct vmsig_worker { + pthread_t* threads; + int nthreads; + pthread_mutex_t lock; + pthread_cond_t cv; + work_q req; /* loop -> workers */ + work_q res; /* workers -> loop */ + int evfd; + int stop; + int max_depth; /* cap on req-queue depth */ + int req_count; /* current req-queue depth */ + vmsig_work_fn fn; + void* user; +}; + +static void q_push(work_q* q, work_node* n) { + n->next = NULL; + if (q->tail) q->tail->next = n; else q->head = n; + q->tail = n; +} +static work_node* q_pop(work_q* q) { + work_node* n = q->head; + if (!n) return NULL; + q->head = n->next; + if (!q->head) q->tail = NULL; + return n; +} +static void q_drain(work_q* q) { + work_node* n = q->head; + while (n) { work_node* nx = n->next; free(n); n = nx; } + q->head = q->tail = NULL; +} + +static void* worker_main(void* arg) { + vmsig_worker* w = arg; + for (;;) { + pthread_mutex_lock(&w->lock); + while (!w->stop && !w->req.head) pthread_cond_wait(&w->cv, &w->lock); + /* On stop we DRAIN the queue: run the remaining requests so that submitted + * work is not silently lost (matters for jobs carrying resource ownership). + * We exit only when stop AND the queue is empty. */ + if (w->stop && !w->req.head) { pthread_mutex_unlock(&w->lock); break; } + work_node* rq = q_pop(&w->req); + if (rq) w->req_count--; + pthread_mutex_unlock(&w->lock); + if (!rq) continue; + + work_node* rs = calloc(1, sizeof *rs); + if (rs) { + rs->rc = w->fn ? 
w->fn(w->user, rq->buf, rs->buf) : -1; + rs->len = VMSIG_WORK_SLOT; + pthread_mutex_lock(&w->lock); + q_push(&w->res, rs); + pthread_mutex_unlock(&w->lock); + uint64_t one = 1; + ssize_t r = write(w->evfd, &one, sizeof one); + (void)r; + } + free(rq); + } + return NULL; +} + +vmsig_worker* vmsig_worker_new(vmsig_work_fn fn, void* user, int nthreads, int max_depth) { + if (nthreads < 1) nthreads = 1; + vmsig_worker* w = calloc(1, sizeof *w); + if (!w) return NULL; + w->fn = fn; w->user = user; w->evfd = -1; + w->max_depth = max_depth > 0 ? max_depth : 512; + w->threads = calloc((size_t)nthreads, sizeof *w->threads); + if (!w->threads) { free(w); return NULL; } + w->evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (w->evfd < 0) { free(w->threads); free(w); return NULL; } + if (pthread_mutex_init(&w->lock, NULL) != 0) { close(w->evfd); free(w->threads); free(w); return NULL; } + if (pthread_cond_init(&w->cv, NULL) != 0) { + pthread_mutex_destroy(&w->lock); close(w->evfd); free(w->threads); free(w); return NULL; + } + for (int i = 0; i < nthreads; i++) { + if (pthread_create(&w->threads[i], NULL, worker_main, w) != 0) break; + w->nthreads++; + } + if (w->nthreads == 0) { + pthread_cond_destroy(&w->cv); pthread_mutex_destroy(&w->lock); + close(w->evfd); free(w->threads); free(w); return NULL; + } + return w; +} + +void vmsig_worker_free(vmsig_worker* w) { + if (!w) return; + pthread_mutex_lock(&w->lock); + w->stop = 1; + pthread_cond_broadcast(&w->cv); + pthread_mutex_unlock(&w->lock); + for (int i = 0; i < w->nthreads; i++) pthread_join(w->threads[i], NULL); + q_drain(&w->req); + q_drain(&w->res); + pthread_cond_destroy(&w->cv); + pthread_mutex_destroy(&w->lock); + if (w->evfd >= 0) close(w->evfd); + free(w->threads); + free(w); +} + +int vmsig_worker_evfd(const vmsig_worker* w) { return w ? 
w->evfd : -1; } + +int vmsig_worker_submit(vmsig_worker* w, const void* req, size_t len) { + if (!w || len > VMSIG_WORK_SLOT) return -1; + pthread_mutex_lock(&w->lock); + if (w->req_count >= w->max_depth) { /* queue cap: reject flooding */ + pthread_mutex_unlock(&w->lock); + return -1; + } + work_node* n = calloc(1, sizeof *n); + if (!n) { pthread_mutex_unlock(&w->lock); return -1; } + if (req && len) memcpy(n->buf, req, len); + n->len = len; + q_push(&w->req, n); + w->req_count++; + pthread_cond_signal(&w->cv); + pthread_mutex_unlock(&w->lock); + return 0; +} + +void vmsig_worker_ack(vmsig_worker* w) { + if (!w) return; + uint64_t v; + while (read(w->evfd, &v, sizeof v) == (ssize_t)sizeof v) { /* drain */ } +} + +int vmsig_worker_poll(vmsig_worker* w, void* res, size_t cap, int* rc) { + if (!w) return -1; + pthread_mutex_lock(&w->lock); + work_node* n = q_pop(&w->res); + pthread_mutex_unlock(&w->lock); + if (!n) return 0; + if (res && cap) memcpy(res, n->buf, cap < n->len ? cap : n->len); + if (rc) *rc = n->rc; + free(n); + return 1; +} diff --git a/src/adapter/memctx/include/memctx.h b/src/adapter/memctx/include/memctx.h new file mode 100644 index 0000000..28ad5b9 --- /dev/null +++ b/src/adapter/memctx/include/memctx.h @@ -0,0 +1,20 @@ +#ifndef VMSIG_MEMCTX_CFG_H +#define VMSIG_MEMCTX_CFG_H +#include + +/* Private config of the memctx adapter (vmie). Passed as opaque to open(); NOT + * public (layout per reference: src//include/). cfg==NULL => stub. */ +typedef struct { + int stub; /* 1 => synthetic kcr3/RO-fd (spine without a VM) */ + const char* ram_path; /* armed: path to guest RAM backing (NOT published outward) */ + uint64_t low; /* below-4G split (vmie_win32_open / locator.low) */ + int ro_fd; /* >=0 => infra supplied a pre-sealed RO-fd (policy); */ + /* <0 => default: open(ram_path, O_RDONLY) / stub-memfd */ +} vmsig_memctx_cfg; + +/* Max SRC bytes per atomic gva_write (bounds the worker POD slot; mc_req header + src + * must stay <= VMSIG_WORK_SLOT). 
Private to the adapter (an executor bound), NOT part of + * the neutral control contract — control only needs VMSIG_MEMWRITE_INLINE for inline SRC. */ +#define VMSIG_MEMWRITE_MAX 192u + +#endif /* VMSIG_MEMCTX_CFG_H */ diff --git a/src/adapter/memctx/memctx.c b/src/adapter/memctx/memctx.c new file mode 100644 index 0000000..089a1b9 --- /dev/null +++ b/src/adapter/memctx/memctx.c @@ -0,0 +1,407 @@ +/* memctx.c — vmie sensor adapter: vends ONE coherent guest address-space context — + * the permanent System DirectoryTableBase (`kcr3`) PAIRED with a RAM-region locator + * and a pre-opened O_RDONLY fd. This is NOT perception and NOT semantics: signaling + * multicasts the datum + RO-fd, while the holder (an S-lib / any control) opens ITS OWN + * read-only vmie_mem from the fd and does gva_read/scan/pmap itself. + * + * Cold bring-up (host_bootstrap) is CPU-bound and blocking, so it runs on an off-loop + * worker; the loop thread only assembles the locator on the completion-eventfd and emits + * the MEMCTX trigger. The epoch is stamped by the CORE (retained-context); on an epoch + * change the core calls reg.invalidate, the adapter re-bootstraps and re-emits MEMCTX. + * + * RO outward is physical: O_RDONLY fd => mmap(PROT_WRITE) -> EACCES, so a write into the + * guest on the holder side is structurally impossible. stub mode (without VMSIG_WITH_VMIE + * or ram_path==NULL) synthesizes a kcr3 and a genuinely RO-mappable fd (memfd + seal) — + * the seam is provable without a VM. */ +#define _GNU_SOURCE +#include "vmsig_adapter.h" +#include "memctx.h" +#include "adapter_util.h" /* vmsig_worker (off-loop bootstrap) */ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef VMSIG_WITH_VMIE +#include "win32.h" /* vmie_win32_open/host_bootstrap/proc_list/close */ +#endif + +/* memfd_create / seal — ABI fallbacks for old glibc/kernel (stub RO-fd backing). 
/* memfd_create / seal — ABI fallbacks for old glibc/kernel (stub RO-fd backing).
 * Each group is only defined when the libc headers did not already provide it. */
#ifndef MFD_CLOEXEC
#include <sys/syscall.h>   /* SYS_memfd_create */
#include <unistd.h>        /* syscall() */
/* This branch is taken exactly when the headers lack MFD_CLOEXEC, and mc_make_stub_fd
 * passes that flag to memfd_create — so the constant has to be supplied together with
 * the wrapper, otherwise the fallback path does not compile. */
#define MFD_CLOEXEC 0x0001U
static int memfd_create(const char* name, unsigned int flags) {
    return (int)syscall(SYS_memfd_create, name, flags);
}
#endif
#ifndef MFD_ALLOW_SEALING
#define MFD_ALLOW_SEALING 0x0002U
#endif
#ifndef F_ADD_SEALS
#define F_ADD_SEALS (1024 + 9)
#define F_SEAL_SHRINK 0x0002
#define F_SEAL_GROW 0x0004
#endif
#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE 0x0010 /* kernel 5.1+: forbid future writable mappings */
#endif
*/ +typedef struct { + uint32_t op; /* MC_JOB_* */ + uint32_t boot_count; /* MC_JOB_BOOTSTRAP */ + /* --- MC_JOB_WRITE --- */ + uint64_t gva; + uint32_t len; + uint32_t corr; + uint32_t origin; + uint8_t src[VMSIG_MEMWRITE_MAX]; /* SRC bytes copied off-loop (gva_write reads this) */ +} mc_req; +typedef struct { + uint32_t op; /* echoes the job type so on_ready demuxes */ + int ok; /* MC_JOB_WRITE result */ + uint32_t corr; + uint32_t origin; + uint64_t kcr3; /* MC_JOB_BOOTSTRAP result */ +} mc_res; + +struct vmsig_adapter { + uint32_t endpoint; + int stub; + const char* ram_path; /* armed: RAM-backing path (NOT published outward) */ + uint64_t low; + int cfg_ro_fd; /* >=0 => infra-sealed RO-fd (policy); <0 => default */ + vmsig_emit emit; + int registered; /* register_memctx already called */ + vmsig_worker* worker; /* off-loop bootstrap + atomic writes */ + uint32_t boot_count; /* incremented on each (re-)bootstrap */ + +#ifdef VMSIG_WITH_VMIE + vmie_win32* win; /* held RW handle across the epoch (kcr3 source + gva_write target) */ + vmie_mem* mem; /* vmie_win32_mem(win); borrowed, valid until vmie_win32_close */ +#endif + uint64_t kcr3; /* current System DTB (also published in cur_pod.kcr3) */ + + /* persistent locator: owned by the loop thread; worker only yields kcr3 into scratch. */ + int have_ctx; + vmsig_memctx cur_pod; /* kcr3/low/nseg/flags (epoch stamped by the core) */ + vmsig_memseg cur_segs[MC_MAX_SEG]; + uint32_t cur_nseg; + + int stub_fd; /* stub: memfd of synth RAM (+seal); share_fd reopens it */ +}; + +/* fwd: MEMWRITE completion ACK (defined below mc_submit; used in mc_on_ready demux). 
*/ +static void mc_memwrite_ack(struct vmsig_adapter* a, int ok, uint32_t corr, uint32_t origin); + +/* ---- stub RO-fd: memfd + deterministic contents + seal of future writes ---- */ +static int mc_make_stub_fd(uint32_t size) { + int fd = memfd_create("vmsig_memctx", MFD_CLOEXEC | MFD_ALLOW_SEALING); + if (fd < 0) fd = memfd_create("vmsig_memctx", MFD_CLOEXEC); + if (fd < 0) return -1; + if (ftruncate(fd, (off_t)size) != 0) { close(fd); return -1; } + /* deterministic contents via a temporary RW mapping BEFORE the seal */ + uint8_t* p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p != MAP_FAILED) { + for (uint32_t i = 0; i < size; i++) p[i] = (uint8_t)(i & 0xFFu); + munmap(p, size); + } + /* FUTURE_WRITE: even if the holder reopens the fd as O_RDWR, it gets no writable mapping. + * best-effort (kernel 5.1+); on older kernels only the O_RDONLY fd protects. */ + if (fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_FUTURE_WRITE) != 0) + (void)fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW); + return fd; +} + +#ifdef VMSIG_WITH_VMIE +/* armed bring-up: open RAM (RW is vmie's internal concern), host_bootstrap, extract the + * permanent System DTB as the System process cr3 (kcr3 — the root of the guest AS). The RW + * handle is HELD across the epoch (kcr3 source + gva_write target); ONLY the RO-fd (share_fd) + * leaves outward — write goes through this command plane, never a writable mmap. Runs on the + * off-loop worker; a stale handle from a prior epoch is dropped first (serialized FIFO with + * in-flight writes). 
*/ +static int mc_bootstrap_armed(struct vmsig_adapter* a, uint64_t* out_kcr3) { + if (a->win) { vmie_win32_close(a->win); a->win = NULL; a->mem = NULL; } /* drop stale epoch handle */ + vmie_win32* v = vmie_win32_open(a->ram_path, a->low); + if (!v) return -1; + if (host_bootstrap(v) != 0) { vmie_win32_close(v); return -1; } + process procs[16]; + int n = proc_list(v, 0, procs, 16); + uint64_t kcr3 = 0; + for (int i = 0; i < n && i < 16; i++) + if (!strcmp(procs[i].name, "System")) { kcr3 = procs[i].cr3; break; } + if (!kcr3) { vmie_win32_close(v); return -1; } + a->win = v; /* HOLD: RW handle lives across the epoch */ + a->mem = vmie_win32_mem(v); /* borrowed; valid until vmie_win32_close(v) */ + a->kcr3 = kcr3; + *out_kcr3 = kcr3; + return 0; +} +#endif + +/* ---- worker job: cold bring-up OR atomic write, off-loop ----------------- * + * Demultiplexed by rq->op. BOTH run on the SAME single worker thread, so a write on the + * held handle never races the close-on-rebootstrap (FIFO). The job MUST NOT touch core + * structures — it only reads a->mem/a->kcr3 (stable between re-bootstraps on this thread). */ +static int mc_job(void* user, const void* req, void* res) { + struct vmsig_adapter* a = user; + const mc_req* rq = req; + mc_res* rs = res; + memset(rs, 0, sizeof *rs); + rs->op = rq->op; + + if (rq->op == MC_JOB_WRITE) { + rs->corr = rq->corr; rs->origin = rq->origin; + if (a->stub) { rs->ok = 1; return 0; } /* stub: ack without actuation */ +#ifdef VMSIG_WITH_VMIE + /* a->mem is NULL until a bootstrap has succeeded (or after one failed and cleared it): + * the guard turns that into an ok=0 ACK (observable to the initiator), not a crash. */ + rs->ok = (a->mem && gva_write(a->mem, (uintptr_t)a->kcr3, (uintptr_t)rq->gva, + rq->src, rq->len) == 0); + return rs->ok ? 
0 : -1; +#else + rs->ok = 0; + return -1; /* armed without the build flag: write impossible */ +#endif + } + + /* MC_JOB_BOOTSTRAP */ + if (a->stub) { + rs->kcr3 = 0xC0DE0000ull + (uint64_t)rq->boot_count * 0x1000ull; /* changes per epoch */ + return 0; + } +#ifdef VMSIG_WITH_VMIE + uint64_t kcr3 = 0; + if (mc_bootstrap_armed(a, &kcr3) != 0) return -1; + rs->kcr3 = kcr3; + return 0; +#else + return -1; /* armed without the build flag: bootstrap impossible -> ERROR */ +#endif +} + +static void mc_kick_bootstrap(struct vmsig_adapter* a) { + a->boot_count++; + mc_req rq; + memset(&rq, 0, sizeof rq); + rq.op = MC_JOB_BOOTSTRAP; rq.boot_count = a->boot_count; + (void)vmsig_worker_submit(a->worker, &rq, sizeof rq); /* full => drop (rare) */ +} + +/* ---- reg hooks (vmsig_memctx_reg.ctx = a; called by the core on the loop thread) ---- */ +static void mc_reg_describe(void* ctx, vmsig_memctx* out_pod, + const vmsig_memseg** out_segs, uint32_t* out_nseg) { + struct vmsig_adapter* a = ctx; + *out_pod = a->cur_pod; /* kcr3/low/nseg/flags; the core overwrites the epoch */ + *out_segs = a->cur_segs; + *out_nseg = a->cur_nseg; +} + +static int mc_reg_share_fd(void* ctx) { + struct vmsig_adapter* a = ctx; + if (a->cfg_ro_fd >= 0) + return fcntl(a->cfg_ro_fd, F_DUPFD_CLOEXEC, 0); /* infra-sealed RO-fd: dup */ + if (a->stub) { + if (a->stub_fd < 0) return -1; + char path[64]; + snprintf(path, sizeof path, "/proc/self/fd/%d", a->stub_fd); + return open(path, O_RDONLY | O_CLOEXEC); /* fresh O_RDONLY on the backing */ + } + if (!a->ram_path) return -1; + return open(a->ram_path, O_RDONLY | O_CLOEXEC); /* armed default */ +} + +static void mc_reg_invalidate(void* ctx, uint32_t epoch) { + struct vmsig_adapter* a = ctx; + (void)epoch; /* the core owns the epoch; the adapter must re-bootstrap */ + a->have_ctx = 0; /* the previous context is invalid */ + mc_kick_bootstrap(a); /* off-loop; on_ready re-emits MEMCTX (new epoch) */ +} + +/* ---- vtable ---- */ +static vmsig_adapter* 
mc_open(const void* cfg, uint32_t endpoint) { + const vmsig_memctx_cfg* c = cfg; + struct vmsig_adapter* a = calloc(1, sizeof *a); + if (!a) return NULL; + a->endpoint = endpoint; + a->stub = c ? c->stub : 1; + a->ram_path = c ? c->ram_path : NULL; + a->low = c ? c->low : 0; + a->cfg_ro_fd = (c && c->ro_fd >= 0) ? c->ro_fd : -1; + if (!a->ram_path && a->cfg_ro_fd < 0) a->stub = 1; /* no path/fd => stub */ + a->stub_fd = -1; + return a; +} + +static int mc_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg, int cap) { + if (cap < 1) return -1; + a->emit = *emit; + + a->worker = vmsig_worker_new(mc_job, a, 1, MC_WORKER_DEPTH); + if (!a->worker) return -1; + + if (a->stub && a->cfg_ro_fd < 0) { + a->stub_fd = mc_make_stub_fd(MC_STUB_SIZE); + if (a->stub_fd < 0) { vmsig_worker_free(a->worker); a->worker = NULL; return -1; } + } + + /* worker completion-eventfd as the readiness source (cookie=0). */ + reg[0].fd = vmsig_worker_evfd(a->worker); + reg[0].epoll_events = EPOLLIN; + reg[0].shape = VMSIG_RDY_EVENTFD; + reg[0].cookie = 0; + + /* register the reg BEFORE the first bootstrap: the core slot gets the hooks. describe + * is not called until the slot is valid (which only happens after the first MEMCTX). 
*/ + if (a->emit.register_memctx) { + vmsig_memctx_reg r; + memset(&r, 0, sizeof r); + r.endpoint = a->endpoint; + r.source = VMSIG_SRC_MEMCTX; + r.ctx = a; + r.describe = mc_reg_describe; + r.share_fd = mc_reg_share_fd; + r.invalidate = mc_reg_invalidate; + if (a->emit.register_memctx(a->emit.token, &r) == 0) a->registered = 1; + } + + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_SEAM_UP; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; + a->emit.emit(a->emit.token, &up); + + mc_kick_bootstrap(a); /* first bootstrap off-loop; assemble the locator on completion */ + return 1; +} + +static int mc_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) { + (void)cookie; (void)events; + vmsig_worker_ack(a->worker); + mc_res rs; + int rc; + while (vmsig_worker_poll(a->worker, &rs, sizeof rs, &rc) == 1) { + if (rs.op == MC_JOB_WRITE) { + /* atomic write completed: addressed ACT_ACK to the initiator. */ + mc_memwrite_ack(a, rs.ok && rc == 0, rs.corr, rs.origin); + continue; + } + if (rc != 0) { + /* bootstrap failed: ERROR (source MEMCTX); do NOT publish an invalid kcr3. */ + vmsig_event er; + memset(&er, 0, sizeof er); + er.kind = VMSIG_EV_ERROR; er.source = VMSIG_SRC_MEMCTX; er.dir = VMSIG_DIR_UP; + er.prio = VMSIG_PRIO_URGENT; er.endpoint = a->endpoint; + a->emit.emit(a->emit.token, &er); + continue; + } + /* assemble the locator on the loop thread from rs.kcr3. a->kcr3 is the gva_write + * TARGET and is owned SOLELY by the worker thread (set in mc_bootstrap_armed, read by + * MC_JOB_WRITE — same thread, FIFO happens-before); the loop must NOT also write it, or + * an in-flight write at line ~170 would race it. cur_pod.kcr3 is loop-only (delivery). */ + memset(&a->cur_pod, 0, sizeof a->cur_pod); + a->cur_pod.kcr3 = rs.kcr3; + a->cur_pod.low = a->low ? a->low : MC_STUB_SIZE; + a->cur_pod.flags = VMSIG_MEMCTX_RDONLY; + a->cur_nseg = 1; /* single-low identity (gpa 0 .. 
low) */ + a->cur_segs[0].gpa = 0; + a->cur_segs[0].len = a->cur_pod.low; + a->cur_segs[0].file_off = 0; + a->cur_pod.nseg = a->cur_nseg; + a->have_ctx = 1; + + /* emit the MEMCTX trigger: the core authoritatively re-describes + stamps the epoch. */ + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_MEMCTX; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; + memcpy(up.inln, &a->cur_pod, sizeof a->cur_pod); + a->emit.emit(a->emit.token, &up); + } + return 0; +} + +/* Emit an addressed ACT_ACK for a MEMWRITE (source MEMCTX, to the initiator). inln carries + * {ok,corr,origin} (same shape as the input adapter's ACK), so control reads ok at offset 0. + * ok=0 covers extent-deny / no-SRC / queue-full / write failure (default-deny, observable). */ +static void mc_memwrite_ack(struct vmsig_adapter* a, int ok, uint32_t corr, uint32_t origin) { + struct { int ok; uint32_t corr; uint32_t origin; } body = { ok, corr, origin }; + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_ACT_ACK; up.source = VMSIG_SRC_MEMCTX; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; + up.corr = corr; up.origin = origin; + up.payload.flags = VMSIG_PL_INLINE; + memcpy(up.inln, &body, sizeof body); + a->emit.emit(a->emit.token, &up); +} + +/* DOWN MEMWRITE handler: validate extent, copy SRC off-loop, submit the atomic gva_write to + * the worker. Default-deny: any invalid path (no SRC flag, len out of bounds, short payload, + * queue full) ACKs ok=0 and does NOT actuate. The completion ACK for a queued write arrives + * via mc_on_ready. Returns 0 when the event is consumed by this seam, 1 when it is not ours. 
*/ +static int mc_submit(vmsig_adapter* a, const vmsig_event* ev) { + if (ev->kind != VMSIG_EV_CMD_MEMWRITE) return 1; /* not for this seam */ + + const vmsig_memwrite* mw = (const vmsig_memwrite*)ev->inln; + uint32_t len = mw->len; + if (len == 0 || len > VMSIG_MEMWRITE_MAX) { /* extent: bounded */ + mc_memwrite_ack(a, 0, ev->corr, ev->origin); + return 0; + } + mc_req rq; memset(&rq, 0, sizeof rq); + rq.op = MC_JOB_WRITE; rq.gva = mw->gva; rq.len = len; + rq.corr = ev->corr; rq.origin = ev->origin; + + /* copy SRC into the worker req (off-loop gva_write reads from rq.src). */ + if (mw->flags & VMSIG_MW_SRC_INLINE) { + if (len > VMSIG_MEMWRITE_INLINE) { mc_memwrite_ack(a, 0, ev->corr, ev->origin); return 0; } + memcpy(rq.src, ev->inln + sizeof *mw, len); /* inln tail after the 16-byte header */ + } else if (mw->flags & VMSIG_MW_SRC_PAYLOAD) { + if (!ev->payload.data || ev->payload.len < len) { mc_memwrite_ack(a, 0, ev->corr, ev->origin); return 0; } + memcpy(rq.src, ev->payload.data, len); /* in-proc borrowed payload */ + } else { + mc_memwrite_ack(a, 0, ev->corr, ev->origin); /* no SRC flag */ + return 0; + } + + if (vmsig_worker_submit(a->worker, &rq, sizeof rq) != 0) { + mc_memwrite_ack(a, 0, ev->corr, ev->origin); /* queue full -> ACK err */ + return -1; + } + return 0; /* completion ACK arrives via mc_on_ready */ +} + +static void mc_close(vmsig_adapter* a) { + if (!a) return; + if (a->registered && a->emit.unregister_memctx) + a->emit.unregister_memctx(a->emit.token, a->endpoint); + if (a->worker) vmsig_worker_free(a->worker); /* join: bootstrap + write jobs finished */ +#ifdef VMSIG_WITH_VMIE + if (a->win) vmie_win32_close(a->win); /* AFTER worker join: no in-flight gva_write */ +#endif + if (a->stub_fd >= 0) close(a->stub_fd); + /* cfg_ro_fd belongs to the infrastructure (the open caller) — do NOT close it. 
*/ + free(a); +} + +static const vmsig_adapter_ops MC_OPS = { + .name = "memctx", .source = VMSIG_SRC_MEMCTX, .codec = VMSIG_CODEC_MEMCTX, + .open = mc_open, .attach = mc_attach, .on_readiness = mc_on_ready, + .submit = mc_submit, .close = mc_close +}; + +const vmsig_adapter_ops* vmsig_memctx_ops(void) { return &MC_OPS; } diff --git a/src/adapter/vmhost/include/vmhost.h b/src/adapter/vmhost/include/vmhost.h new file mode 100644 index 0000000..6cea5e3 --- /dev/null +++ b/src/adapter/vmhost/include/vmhost.h @@ -0,0 +1,13 @@ +#ifndef VMSIG_VMHOST_H +#define VMSIG_VMHOST_H + +/* Private config of the vmhost adapter (signaling's own QMP client). + * cfg==NULL or no qmp_path => stub mode (synthetic events, no QEMU). + * qmp_path given => armed: connect to QEMU's QMP socket ('@' prefix = abstract). + * No build flag needed — the client depends only on POSIX and its own code. */ +typedef struct { + int stub; + const char* qmp_path; +} vmsig_vmhost_cfg; + +#endif /* VMSIG_VMHOST_H */ diff --git a/src/adapter/vmhost/vmhost.c b/src/adapter/vmhost/vmhost.c new file mode 100644 index 0000000..1465f2a --- /dev/null +++ b/src/adapter/vmhost/vmhost.c @@ -0,0 +1,313 @@ +/* vmhost.c — QEMU/QMP host-plane: signaling's OWN layer for observing the VM + * and its basic control. Not a wrapper over a neighbor repo — an own QMP client; + * depends only on POSIX, so it is always functional (no build flag). + * + * This is the first truly epoll-native source: the QMP socket (VMSIG_RDY_FD) lives + * directly in the loop, non-blocking, async events. Up: QMP events -> VM_LIFECYCLE + * (broadcast), EOF -> SEAM_DOWN. Down: CMD_VM -> QMP command with id correlation, + * reply addressed to the initiator. stub mode (no QEMU) synthesizes events/replies. 
/* Extract the string value that follows `key` in `line`.
 *
 * line: NUL-terminated text to search.
 * key:  the key INCLUDING its quotes, e.g. "\"event\"".
 * out:  receives the value, always NUL-terminated; values longer than cap-1 are truncated.
 * cap:  size of out in bytes.
 *
 * Returns 1 when a value was copied, 0 otherwise (out is left untouched then).
 *
 * Every occurrence of key is tried in order, and the first one followed by optional
 * spaces/tabs/colons and an opening quote wins. This matters when the key text first
 * shows up as a VALUE (e.g. {"kind": "event"}, "event": "STOP"): stopping at the first
 * textual hit would report "not found". This is still not a JSON parser: backslash
 * escapes inside the value are not interpreted. */
static int jstr(const char* line, const char* key, char* out, size_t cap) {
    if (!line || !key || !out || cap == 0) return 0;   /* cap 0: no room even for the NUL */
    size_t klen = strlen(key);
    if (klen == 0) return 0;

    for (const char* hit = strstr(line, key); hit; hit = strstr(hit + 1, key)) {
        const char* p = hit + klen;
        while (*p == ' ' || *p == '\t' || *p == ':') p++;
        if (*p != '"') continue;                       /* not a string-valued key: try the next hit */
        p++;
        size_t i = 0;
        while (*p && *p != '"' && i + 1 < cap) out[i++] = *p++;
        out[i] = 0;
        return 1;
    }
    return 0;
}
(!strcmp(s, "shutdown")) return VMSIG_VM_SHUTDOWN; + return VMSIG_VM_UNKNOWN; +} +static const char* op_qmp(uint32_t op) { + switch (op) { + case VMSIG_VMOP_QUERY: return "query-status"; + case VMSIG_VMOP_CONT: return "cont"; + case VMSIG_VMOP_STOP: return "stop"; + case VMSIG_VMOP_RESET: return "system_reset"; + case VMSIG_VMOP_POWERDOWN: return "system_powerdown"; + case VMSIG_VMOP_QUIT: return "quit"; + default: return NULL; + } +} + +static pend_ent* pend_alloc(struct vmsig_adapter* a) { + for (int i = 0; i < VMHOST_MAX_PENDING; i++) if (!a->pend[i].used) return &a->pend[i]; + return NULL; +} +static pend_ent* pend_find(struct vmsig_adapter* a, uint32_t id) { + for (int i = 0; i < VMHOST_MAX_PENDING; i++) + if (a->pend[i].used && a->pend[i].id == id) return &a->pend[i]; + return NULL; +} + +/* ---- emission of neutral UP events ---- */ +static void emit_vm(struct vmsig_adapter* a, uint32_t state, uint32_t origin, uint32_t corr) { + vmsig_vm_state vs = { state, 0 }; + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_VM_LIFECYCLE; up.source = VMSIG_SRC_VMHOST; up.dir = VMSIG_DIR_UP; + up.prio = (state == VMSIG_VM_RUNNING || state == VMSIG_VM_PAUSED) + ? 
VMSIG_PRIO_NORMAL : VMSIG_PRIO_URGENT; + up.endpoint = a->endpoint; up.origin = origin; up.corr = corr; + up.payload.flags = VMSIG_PL_INLINE; + memcpy(up.inln, &vs, sizeof vs); + a->emit.emit(a->emit.token, &up); +} +static void emit_seam(struct vmsig_adapter* a, vmsig_kind k) { + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = k; up.source = VMSIG_SRC_VMHOST; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_URGENT; up.endpoint = a->endpoint; + a->emit.emit(a->emit.token, &up); +} +static void emit_ack(struct vmsig_adapter* a, uint32_t origin, uint32_t corr, int ok) { + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_ACT_ACK; up.source = VMSIG_SRC_VMHOST; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_NORMAL; up.endpoint = a->endpoint; up.origin = origin; up.corr = corr; + up.payload.flags = VMSIG_PL_INLINE; + up.inln[0] = (uint8_t)(ok ? 1 : 0); + a->emit.emit(a->emit.token, &up); +} + +/* ---- armed: handle one QMP line ---- */ +static void handle_line(struct vmsig_adapter* a, const char* line) { + switch (a->st) { + case ST_CONNECTING: + if (strstr(line, "\"QMP\"")) { /* greeting -> negotiate capabilities */ + static const char cap[] = "{\"execute\":\"qmp_capabilities\"}\r\n"; + ssize_t r = write(a->fd, cap, sizeof cap - 1); (void)r; + a->st = ST_NEGOTIATING; + } + break; + case ST_NEGOTIATING: + if (strstr(line, "\"return\"")) { a->st = ST_READY; emit_seam(a, VMSIG_EV_SEAM_UP); } + break; + case ST_READY: + if (strstr(line, "\"event\"")) { + char name[64]; + if (jstr(line, "\"event\"", name, sizeof name)) { + uint32_t s = ev_state(name); + if (s != VMSIG_VM_UNKNOWN) emit_vm(a, s, 0, 0); /* broadcast */ + } + } else if (strstr(line, "\"return\"") || strstr(line, "\"error\"")) { + long id = jnum(line, "\"id\""); + pend_ent* p = id >= 0 ? 
pend_find(a, (uint32_t)id) : NULL; + if (p) { + if (p->op == VMSIG_VMOP_QUERY && strstr(line, "\"return\"")) { + char stbuf[32]; uint32_t s = VMSIG_VM_UNKNOWN; + if (jstr(line, "\"status\"", stbuf, sizeof stbuf)) s = status_state(stbuf); + emit_vm(a, s, p->origin, p->corr); /* addressed reply */ + } else { + emit_ack(a, p->origin, p->corr, strstr(line, "\"return\"") != NULL); + } + p->used = 0; + } + } + break; + default: break; + } +} + +static void armed_dead(struct vmsig_adapter* a) { + emit_seam(a, VMSIG_EV_SEAM_DOWN); /* VM transport died */ + if (a->fd >= 0) { close(a->fd); a->fd = -1; } /* close removes the fd from epoll */ + a->st = ST_DEAD; +} + +/* ---- vtable ---- */ +static vmsig_adapter* vh_open(const void* cfg, uint32_t endpoint) { + const vmsig_vmhost_cfg* c = cfg; + struct vmsig_adapter* a = calloc(1, sizeof *a); + if (!a) return NULL; + a->endpoint = endpoint; + a->qmp_path = (c && c->qmp_path && c->qmp_path[0]) ? c->qmp_path : NULL; + a->stub = (a->qmp_path == NULL); /* path given => armed, otherwise stub */ + a->fd = -1; + a->cur = VMSIG_VM_RUNNING; + return a; +} + +static int vh_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg, int cap) { + if (cap < 1) return -1; + a->emit = *emit; + + if (a->stub) { + a->fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); + if (a->fd < 0) return -1; + struct itimerspec its; + memset(&its, 0, sizeof its); + its.it_interval.tv_sec = VMHOST_STUB_MS / 1000u; + its.it_interval.tv_nsec = (long)(VMHOST_STUB_MS % 1000u) * 1000000L; + its.it_value = its.it_interval; + if (timerfd_settime(a->fd, 0, &its, NULL) < 0) { close(a->fd); a->fd = -1; return -1; } + a->st = ST_STUB; + reg[0].fd = a->fd; reg[0].epoll_events = EPOLLIN; + reg[0].shape = VMSIG_RDY_TIMERFD; reg[0].cookie = 0; + emit_seam(a, VMSIG_EV_SEAM_UP); + return 1; + } + + /* armed: connect to QEMU's QMP socket */ + int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0); + if (fd < 0) return -1; + struct 
sockaddr_un addr; + memset(&addr, 0, sizeof addr); + addr.sun_family = AF_UNIX; + socklen_t alen; + size_t n = strlen(a->qmp_path); + if (a->qmp_path[0] == '@') { + if (n > sizeof addr.sun_path) { close(fd); return -1; } + addr.sun_path[0] = 0; + memcpy(addr.sun_path + 1, a->qmp_path + 1, n - 1); + alen = (socklen_t)(offsetof(struct sockaddr_un, sun_path) + n); + } else { + if (n >= sizeof addr.sun_path) { close(fd); return -1; } + memcpy(addr.sun_path, a->qmp_path, n); + alen = (socklen_t)sizeof addr; + } + if (connect(fd, (struct sockaddr*)&addr, alen) < 0 && errno != EINPROGRESS) { + close(fd); return -1; + } + a->fd = fd; a->st = ST_CONNECTING; + reg[0].fd = fd; reg[0].epoll_events = EPOLLIN; + reg[0].shape = VMSIG_RDY_FD; reg[0].cookie = 0; + /* SEAM_UP is emitted upon reaching READY (after qmp_capabilities) */ + return 1; +} + +static int vh_on_ready(vmsig_adapter* a, uint32_t cookie, uint32_t events) { + (void)cookie; (void)events; + + if (a->stub) { + uint64_t ticks; + while (read(a->fd, &ticks, sizeof ticks) == (ssize_t)sizeof ticks) { /* drain */ } + a->cur = (a->cur == VMSIG_VM_RUNNING) ? 
VMSIG_VM_PAUSED : VMSIG_VM_RUNNING; + emit_vm(a, a->cur, 0, 0); /* broadcast */ + return 0; + } + + if (a->st == ST_DEAD) return 0; + for (;;) { + if (a->buflen >= sizeof a->buf) a->buflen = 0; /* line overflow -> reset */ + ssize_t r = read(a->fd, a->buf + a->buflen, sizeof a->buf - a->buflen); + if (r == 0) { armed_dead(a); return 0; } + if (r < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK) break; armed_dead(a); return 0; } + a->buflen += (size_t)r; + size_t start = 0; + for (size_t i = 0; i < a->buflen; i++) { + if (a->buf[i] == '\n') { a->buf[i] = 0; handle_line(a, a->buf + start); start = i + 1; } + } + if (start > 0) { memmove(a->buf, a->buf + start, a->buflen - start); a->buflen -= start; } + } + return 0; +} + +static int vh_submit(vmsig_adapter* a, const vmsig_event* ev) { + if (ev->kind != VMSIG_EV_CMD_VM) return 1; /* not for this seam */ + vmsig_vm_cmd cmd; + memcpy(&cmd, ev->inln, sizeof cmd); + + if (a->stub) { + uint32_t s; + switch (cmd.op) { + case VMSIG_VMOP_QUERY: s = a->cur; break; + case VMSIG_VMOP_CONT: s = a->cur = VMSIG_VM_RUNNING; break; + case VMSIG_VMOP_STOP: s = a->cur = VMSIG_VM_PAUSED; break; + case VMSIG_VMOP_RESET: s = VMSIG_VM_RESET; break; + case VMSIG_VMOP_POWERDOWN: s = VMSIG_VM_POWERDOWN; break; + case VMSIG_VMOP_QUIT: s = VMSIG_VM_SHUTDOWN; break; + default: return 1; + } + emit_vm(a, s, ev->origin, ev->corr); /* reply addressed to the initiator */ + return 0; + } + + if (a->st != ST_READY) return -1; + const char* q = op_qmp(cmd.op); + if (!q) return 1; + pend_ent* p = pend_alloc(a); + if (!p) return -1; /* backpressure: pending table is full */ + uint32_t id = ++a->next_id; + p->used = 1; p->id = id; p->origin = ev->origin; p->corr = ev->corr; p->op = (uint8_t)cmd.op; + char line[160]; + int len = snprintf(line, sizeof line, "{\"execute\":\"%s\",\"id\":%u}\r\n", q, id); + ssize_t r = write(a->fd, line, (size_t)len); + if (r != (ssize_t)len) { p->used = 0; return -1; } + return 0; +} + +static void vh_close(vmsig_adapter* 
a) { + if (!a) return; + if (a->fd >= 0) close(a->fd); + free(a); +} + +static const vmsig_adapter_ops VH_OPS = { + .name = "vmhost", .source = VMSIG_SRC_VMHOST, .codec = VMSIG_CODEC_VMHOST, + .open = vh_open, .attach = vh_attach, .on_readiness = vh_on_ready, + .submit = vh_submit, .close = vh_close +}; + +const vmsig_adapter_ops* vmsig_vmhost_ops(void) { return &VH_OPS; } diff --git a/src/cli.c b/src/cli.c new file mode 100644 index 0000000..b391c6d --- /dev/null +++ b/src/cli.c @@ -0,0 +1,182 @@ +/* cli.c — vmsig spine demonstrator (no real VM). + * + * Brings up the context + epoll core, attaches an in-proc control and a set of stub + * adapters (input/vmhost/memctx) on a single endpoint (VM 0). Proves the bidirectional seam: + * UP: SEAM_UP, VM_LIFECYCLE (vmhost stub tick), MEMCTX (kcr3+locator + RO-fd); + * DOWN: CMD_ACQUIRE+CMD_INPUT -> input adapter -> ACT_ACK (correlation); + * CMD_VM QUERY -> vmhost -> VM_LIFECYCLE (addressed reply). + * The address-space context arrives via MULTICAST: control receives kcr3 and a + * pre-opened O_RDONLY fd of the RAM region (control does NOT see ram_path; it mmaps + * the fd itself, write -> EACCES). (vgpu frame perception now lives in an out-of-repo + * S-lib that consumes this MEMCTX seam — not in signaling.) + * Shutdown: on SIGINT or automatically, once all paths are proven. 
*/ +#include "vmsig.h" +#include +#include +#include +#include +#include + +static vmsig_core* g_core; +static void on_sigint(int s) { (void)s; if (g_core) vmsig_core_stop(g_core); } + +typedef struct { + vmsig_core* core; + void* ctl; + int total, lifecycles, acks, seams, memctx; + uint64_t last_kcr3; + uint32_t last_epoch; + int sent_first; /* sent acquire+input+vm on the first lifecycle tick */ +} demo; + +static const char* kind_name(vmsig_kind k) { + switch (k) { + case VMSIG_EV_SEAM_UP: return "SEAM_UP"; + case VMSIG_EV_SEAM_DOWN: return "SEAM_DOWN"; + case VMSIG_EV_VM_LIFECYCLE: return "VM_LIFECYCLE"; + case VMSIG_EV_ACT_ACK: return "ACT_ACK"; + case VMSIG_EV_MEMCTX: return "MEMCTX"; + default: return "?"; + } +} + +/* Core -> control: address-space context + pre-opened O_RDONLY fd of the RAM region. + * Demonstrate RO: mmap(PROT_READ) ok, mmap(PROT_WRITE) -> EACCES. The fd is borrowed + * (closed by the core after the call) — here we mmap and immediately unmap. */ +static int on_memctx(void* user, const vmsig_event* ev, int fd) { + demo* d = user; + const vmsig_memctx* m = (const vmsig_memctx*)ev->inln; + d->memctx++; + d->last_kcr3 = m->kcr3; d->last_epoch = m->epoch; + uint32_t nseg = 0; + const vmsig_memseg* segs = vmsig_memctx_segs(ev, &nseg); + printf(" UP MEMCTX ep=%u kcr3=%#llx low=%#llx epoch=%u nseg=%u rdonly=%d\n", + (unsigned)ev->endpoint, (unsigned long long)m->kcr3, + (unsigned long long)m->low, (unsigned)m->epoch, (unsigned)nseg, + (m->flags & VMSIG_MEMCTX_RDONLY) ? 1 : 0); + if (fd >= 0 && m->low) { + void* ro = mmap(NULL, (size_t)m->low, PROT_READ, MAP_SHARED, fd, 0); + if (ro != MAP_FAILED) { + void* rw = mmap(NULL, (size_t)m->low, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + printf(" RO-fd: mmap(PROT_READ) ok, mmap(PROT_WRITE) %s\n", + rw == MAP_FAILED ? 
"EACCES (RO enforced)" : "UNEXPECTEDLY succeeded!"); + if (rw != MAP_FAILED) munmap(rw, (size_t)m->low); + munmap(ro, (size_t)m->low); + } + } + (void)segs; + return 0; +} + +static int on_event(void* user, const vmsig_event* ev) { + demo* d = user; + d->total++; + switch (ev->kind) { + case VMSIG_EV_SEAM_UP: d->seams++; break; + case VMSIG_EV_ACT_ACK: d->acks++; break; + default: break; + } + + if (ev->kind == VMSIG_EV_VM_LIFECYCLE) { + d->lifecycles++; + vmsig_vm_state vs; memcpy(&vs, ev->inln, sizeof vs); + printf(" UP VM_LIFECYCLE ep=%u state=%u%s\n", + (unsigned)ev->endpoint, (unsigned)vs.state, ev->origin ? " (reply)" : ""); + } else if (ev->kind != VMSIG_EV_MEMCTX) { /* MEMCTX is printed in on_memctx */ + printf(" UP %-12s src=%u ep=%u seq=%u prio=%u\n", + kind_name(ev->kind), (unsigned)ev->source, (unsigned)ev->endpoint, + (unsigned)ev->seq, (unsigned)ev->prio); + } + + /* On the first lifecycle tick: acquire the INPUT lease, send input, and query VM status. */ + if (ev->kind == VMSIG_EV_VM_LIFECYCLE && !ev->origin && !d->sent_first) { + d->sent_first = 1; + /* Input is a destructive class: first acquire the exclusive INPUT lease. 
*/ + vmsig_event acq; + memset(&acq, 0, sizeof acq); + acq.kind = VMSIG_EV_CMD_ACQUIRE; acq.source = VMSIG_SRC_INPUT; acq.dir = VMSIG_DIR_DOWN; + acq.prio = VMSIG_PRIO_HIGH; acq.endpoint = 0; + ((vmsig_lease_req*)acq.inln)->cls = VMSIG_LEASE_INPUT; + printf(" DOWN CMD_ACQUIRE INPUT@ep0\n"); + vmsig_inproc_send(d->ctl, &acq); + + vmsig_event in; + memset(&in, 0, sizeof in); + in.kind = VMSIG_EV_CMD_INPUT; in.source = VMSIG_SRC_INPUT; in.dir = VMSIG_DIR_DOWN; + in.prio = VMSIG_PRIO_HIGH; in.endpoint = 0; in.corr = 0xC0FFEEu; + in.payload.flags = VMSIG_PL_INLINE; + vmsig_input act; memset(&act, 0, sizeof act); /* neutral public input contract */ + act.kind = VMSIG_INPUT_ABS; act.code = 0; act.value = 100; /* demo: abs axis X = 100 */ + memcpy(in.inln, &act, sizeof act); + printf(" DOWN CMD_INPUT ABS axis=0 val=100 corr=0x%X\n", (unsigned)in.corr); + vmsig_inproc_send(d->ctl, &in); + + vmsig_event vm; + memset(&vm, 0, sizeof vm); + vm.kind = VMSIG_EV_CMD_VM; vm.source = VMSIG_SRC_VMHOST; vm.dir = VMSIG_DIR_DOWN; + vm.prio = VMSIG_PRIO_NORMAL; vm.endpoint = 0; vm.corr = 0x5Au; + vmsig_vm_cmd vc = { VMSIG_VMOP_QUERY }; + memcpy(vm.inln, &vc, sizeof vc); + printf(" DOWN CMD_VM QUERY\n"); + vmsig_inproc_send(d->ctl, &vm); + } + + /* All paths proven — stop (for automated verification). 
*/ + if (d->memctx >= 1 && d->acks >= 1 && d->lifecycles >= 2) vmsig_core_stop(d->core); + return 0; +} + +int main(void) { + vmsig_ctx* ctx = vmsig_ctx_new(); + if (!ctx) { fprintf(stderr, "ctx_new failed\n"); return 1; } + vmsig_core* core = vmsig_core_new(ctx); + if (!core) { fprintf(stderr, "core_new failed\n"); vmsig_ctx_free(ctx); return 1; } + g_core = core; + signal(SIGINT, on_sigint); + + demo d; + memset(&d, 0, sizeof d); + d.core = core; + + vmsig_inproc_cfg ccfg; + memset(&ccfg, 0, sizeof ccfg); + ccfg.on_event = on_event; + ccfg.on_memctx = on_memctx; + ccfg.user = &d; + ccfg.sub.source_mask = 0; /* all sources */ + ccfg.sub.prio_min = VMSIG_PRIO_BULK; + ccfg.sub.endpoint_mask = 0; /* all VMs */ + + void* ctl = vmsig_inproc_control_new(&ccfg); + if (!ctl) { fprintf(stderr, "control_new failed\n"); vmsig_core_free(core); vmsig_ctx_free(ctx); return 1; } + d.ctl = ctl; + + /* Trusted in-proc control: full grant on VM 0 (the policy is set by the embedding + * program; for an out-of-process poller the grant would be issued upon authentication). */ + vmsig_grant grant; + memset(&grant, 0, sizeof grant); + grant.principal = 1; + grant.endpoint_mask = 1u << 0; + grant.source_mask = 0xFFFFFFFFu; + grant.cap_mask = VMSIG_CAP_OBSERVE | VMSIG_CAP_INPUT | VMSIG_CAP_LIFECYCLE | + VMSIG_CAP_MEMCTX | VMSIG_CAP_POWER | VMSIG_CAP_VM; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &grant); + + /* Single endpoint (VM 0), stub adapters (cfg = NULL). 
*/ + if (vmsig_core_add_adapter(core, vmsig_input_ops(), NULL, 0) < 0 || + vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) < 0 || /* stub QEMU plane */ + vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 0) < 0) { /* stub AS context */ + fprintf(stderr, "add_adapter failed\n"); + vmsig_core_free(core); vmsig_ctx_free(ctx); return 1; + } + + printf("vmsig_cli: loop started (Ctrl-C to stop)\n"); + int rc = vmsig_core_run(core); + printf("vmsig_cli: loop finished rc=%d (events=%d seams=%d lifecycles=%d acks=%d memctx=%d kcr3=%#llx epoch=%u)\n", + rc, d.total, d.seams, d.lifecycles, d.acks, d.memctx, + (unsigned long long)d.last_kcr3, (unsigned)d.last_epoch); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); + return rc; +} diff --git a/src/control/inproc.c b/src/control/inproc.c new file mode 100644 index 0000000..c72e8f2 --- /dev/null +++ b/src/control/inproc.c @@ -0,0 +1,57 @@ +/* inproc.c — reference in-process control: a thin shim turning a C callback into + * the neutral control vtable. fd = -1 (no epoll registration); UP arrives via a + * direct on_event call, DOWN leaves through the emit hook installed by the core. */ +#include "vmsig_control.h" +#include +#include + +typedef struct { + vmsig_inproc_cfg cfg; + int (*emit_down)(void* token, vmsig_event*); + void* token; +} inproc_ctl; + +static int ip_fd(void* ctl) { (void)ctl; return -1; } +static int ip_subscribe(void* ctl, vmsig_sub* out) { inproc_ctl* c = ctl; *out = c->cfg.sub; return 0; } +static int ip_deliver(void* ctl, const vmsig_event* ev) { + inproc_ctl* c = ctl; + return c->cfg.on_event ? c->cfg.on_event(c->cfg.user, ev) : 0; +} +static void ip_set_emit_down(void* ctl, int (*emit)(void* token, vmsig_event*), void* token) { + inproc_ctl* c = ctl; c->emit_down = emit; c->token = token; +} +static void ip_close(void* ctl) { free(ctl); } + +/* Core -> in-proc algorithm: address-space context (MEMCTX) + RO-fd as a direct int. 
+ * The fd is borrowed (dup/mmap to retain it); the core closes it after the call. */ +static int ip_attach_memctx(void* ctl, const vmsig_event* ev, int fd) { + inproc_ctl* c = ctl; + if (!c->cfg.on_memctx) return -1; + return c->cfg.on_memctx(c->cfg.user, ev, fd); +} + +static const vmsig_control_ops IP_OPS = { + .name = "inproc", + .fd = ip_fd, + .subscribe = ip_subscribe, + .deliver = ip_deliver, + .on_readable = NULL, /* no fd — nothing to read */ + .set_emit_down = ip_set_emit_down, + .close = ip_close, + .attach_memctx = ip_attach_memctx +}; + +const vmsig_control_ops* vmsig_inproc_control_ops(void) { return &IP_OPS; } + +void* vmsig_inproc_control_new(const vmsig_inproc_cfg* cfg) { + inproc_ctl* c = calloc(1, sizeof *c); + if (!c) return NULL; + if (cfg) c->cfg = *cfg; + return c; +} + +int vmsig_inproc_send(void* ctl, vmsig_event* down) { + inproc_ctl* c = ctl; + if (!c || !c->emit_down) return -1; + return c->emit_down(c->token, down); +} diff --git a/src/control/socket.c b/src/control/socket.c new file mode 100644 index 0000000..074621f --- /dev/null +++ b/src/control/socket.c @@ -0,0 +1,318 @@ +/* socket.c — out-of-process control over a unix socket. + * + * The listener registers in the core as a SLOT_SOURCE (listen-fd). On accept the + * peer is authenticated via SO_PEERCRED, the policy issues a neutral grant; an empty + * grant => the connection is closed (not a valid poller). Otherwise a per-conn + * control is created: its fd is driven by the epoll core, DOWN frames are parsed and + * dispatched through emit_down (enforced by the grant), UP events are serialized into + * a frame. On EOF — deferred reap. + * + * DoS protection: per-uid limit of concurrent connections (against eviction of + * legitimate ones); a janitor timerfd detaches "stuck" partial frames (slowloris). + * The global ceiling and slot reuse live in the core. 
*/ +#define _GNU_SOURCE +#include "vmsig_socket.h" +#include "core_internal.h" /* core_add_source, core_request_drop, add_control */ +#include +#include +#include +#include +#include /* umask */ +#include +#include +#include +#include +#include +#include +#include + +#define VMSIG_SOCK_PER_UID_MAX 8 /* concurrent connections per uid */ +#define VMSIG_SOCK_IDLE_NS (10ull * 1000000000ull) /* timeout for a stuck partial frame */ +#define VMSIG_SOCK_JANITOR_S 5 /* sweep period */ + +typedef struct sock_listener sock_listener; + +static uint64_t now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec; +} + +/* ===== wire codec (public — also for external clients) ===== */ +void vmsig_wire_encode(vmsig_wire* w, const vmsig_event* ev) { + memset(w, 0, sizeof *w); + w->magic = VMSIG_WIRE_MAGIC; w->version = VMSIG_WIRE_VERSION; + w->kind = ev->kind; w->source = ev->source; w->dir = ev->dir; w->prio = ev->prio; + w->endpoint = ev->endpoint; w->corr = ev->corr; + memcpy(w->inln, ev->inln, sizeof w->inln); +} +int vmsig_wire_decode(const vmsig_wire* w, vmsig_event* ev) { + if (w->magic != VMSIG_WIRE_MAGIC || w->version != VMSIG_WIRE_VERSION) return -1; + memset(ev, 0, sizeof *ev); + ev->kind = w->kind; ev->source = w->source; ev->dir = w->dir; ev->prio = w->prio; + ev->endpoint = w->endpoint; ev->corr = w->corr; + ev->payload.flags = VMSIG_PL_INLINE; + memcpy(ev->inln, w->inln, sizeof ev->inln); + return 0; +} + +/* ===== per-conn control ===== */ +typedef struct sock_conn { + int fd; + vmsig_core* core; + int id; + uint32_t uid; + uint64_t last_ns; /* activity for the janitor */ + sock_listener* L; + struct sock_conn* lnext; /* listener's connection list */ + int (*emit_down)(void* token, vmsig_event*); + void* token; + uint8_t buf[sizeof(vmsig_wire)]; + size_t buflen; +} sock_conn; + +static int conn_fd(void* ctl) { return ((sock_conn*)ctl)->fd; } + +static int conn_subscribe(void* ctl, 
vmsig_sub* out) { + (void)ctl; memset(out, 0, sizeof *out); return 0; /* everything; the grant gates it */ +} + +static int conn_deliver(void* ctl, const vmsig_event* ev) { + sock_conn* c = ctl; + vmsig_wire w; + vmsig_wire_encode(&w, ev); + ssize_t r = write(c->fd, &w, sizeof w); /* best-effort; EAGAIN => frame dropped */ + (void)r; + return 0; +} + +static void conn_set_emit_down(void* ctl, int (*emit)(void* token, vmsig_event*), void* token) { + sock_conn* c = ctl; c->emit_down = emit; c->token = token; +} + +static int conn_on_readable(void* ctl) { + sock_conn* c = ctl; + for (;;) { + ssize_t n = read(c->fd, c->buf + c->buflen, sizeof c->buf - c->buflen); + if (n == 0) { core_request_drop(c->core, c->id); return 0; } /* EOF */ + if (n < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) break; + core_request_drop(c->core, c->id); + return 0; + } + c->last_ns = now_ns(); + c->buflen += (size_t)n; + if (c->buflen == sizeof c->buf) { + vmsig_event ev; + if (vmsig_wire_decode((const vmsig_wire*)c->buf, &ev) == 0) { + ev.dir = VMSIG_DIR_DOWN; /* from a poller — DOWN only */ + if (c->emit_down) c->emit_down(c->token, &ev); /* enforced by the grant */ + } + c->buflen = 0; + } + } + return 0; +} + +/* ===== listener ===== */ +struct sock_listener { + int listen_fd; + int janitor_fd; + vmsig_core* core; + vmsig_socket_policy policy; + void* ud; + sock_conn* conns; /* singly-linked list of active connections */ +}; + +static void listener_unlink(sock_listener* L, sock_conn* c) { + sock_conn** pp = &L->conns; + while (*pp) { if (*pp == c) { *pp = c->lnext; return; } pp = &(*pp)->lnext; } +} + +static int listener_uid_count(sock_listener* L, uint32_t uid) { + int n = 0; + for (sock_conn* c = L->conns; c; c = c->lnext) if (c->uid == uid) n++; + return n; +} + +static void conn_close(void* ctl) { + sock_conn* c = ctl; + if (c->L) listener_unlink(c->L, c); + if (c->fd >= 0) close(c->fd); + free(c); +} + +/* Send a SINGLE 80-byte vmsig_wire frame + ONE RO-fd in a cmsg 
(SCM_RIGHTS). This keeps
+ * the control-socket stream fixed-framed at sizeof(vmsig_wire): the client reads one
+ * frame via recvmsg and extracts the fd only on an fd-carrying frame. Partial cmsg
+ * transfer is not allowed (the fd is all-or-nothing): a short sendmsg -> -1. Shared
+ * primitive for the memctx handoff (one SCM_RIGHTS mechanism). */
+static int conn_send_fd_frame(sock_conn* c, const vmsig_wire* w, int fd) {
+    /* Ancillary buffer for exactly one descriptor; the cmsghdr member only forces
+     * suitable alignment of the byte array. */
+    union {
+        struct cmsghdr align;
+        char buf[CMSG_SPACE(sizeof(int))];
+    } ctrl;
+    memset(&ctrl, 0, sizeof ctrl);
+
+    /* Payload: the frame itself, sent as one iovec. */
+    struct iovec frame = { .iov_base = (void*)w, .iov_len = sizeof *w };
+
+    struct msghdr msg;
+    memset(&msg, 0, sizeof msg);
+    msg.msg_iov = &frame;
+    msg.msg_iovlen = 1;
+    msg.msg_control = ctrl.buf;
+    msg.msg_controllen = sizeof ctrl.buf;
+
+    struct cmsghdr* hdr = CMSG_FIRSTHDR(&msg);
+    hdr->cmsg_level = SOL_SOCKET;
+    hdr->cmsg_type = SCM_RIGHTS;
+    hdr->cmsg_len = CMSG_LEN(sizeof(int));
+    memcpy(CMSG_DATA(hdr), &fd, sizeof(int));
+
+    /* Retry only on EINTR; every other error is final. */
+    ssize_t sent;
+    do {
+        sent = sendmsg(c->fd, &msg, MSG_NOSIGNAL);
+    } while (sent < 0 && errno == EINTR);
+
+    if (sent < 0) return -1;
+    return ((size_t)sent == sizeof *w) ? 0 : -1; /* partial frame -> failure */
+}
+
+/* Core -> socket-control: handoff of an address-space context (kind=MEMCTX, inln=vmsig_memctx
+ * POD) + RO-fd of the RAM region in a cmsg. The segs payload does NOT go on the wire (the
+ * fixed-framed vmsig_wire carries only inln); the holder opens it at `low`.
*/ +static int conn_attach_memctx(void* ctl, const vmsig_event* ev, int fd) { + sock_conn* c = ctl; + if (fd < 0 || !ev) return -1; + vmsig_wire w; + vmsig_wire_encode(&w, ev); /* kind=MEMCTX, inln=vmsig_memctx; payload is not serialized */ + return conn_send_fd_frame(c, &w, fd); +} + +static const vmsig_control_ops CONN_OPS = { + .name = "socket", + .fd = conn_fd, .subscribe = conn_subscribe, .deliver = conn_deliver, + .on_readable = conn_on_readable, .set_emit_down = conn_set_emit_down, .close = conn_close, + .attach_memctx = conn_attach_memctx +}; + +static void on_accept(void* user, uint32_t events) { + (void)events; + sock_listener* L = user; + for (;;) { + int fd = accept4(L->listen_fd, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC); + if (fd < 0) break; /* EAGAIN / other — done */ + + uint32_t uid = (uint32_t)-1, pid = 0; + struct ucred uc; socklen_t ul = sizeof uc; + if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &uc, &ul) == 0) { + uid = (uint32_t)uc.uid; pid = (uint32_t)uc.pid; + } + vmsig_grant g; + if (L->policy) g = L->policy(uid, pid, L->ud); + else memset(&g, 0, sizeof g); + + if (g.cap_mask == 0 || g.endpoint_mask == 0) { /* not a valid poller */ + vmsig_audit a = { VMSIG_AUDIT_REJECT, uid, 0, 0, pid }; + core_audit(L->core, &a); + close(fd); + continue; + } + if (listener_uid_count(L, uid) >= VMSIG_SOCK_PER_UID_MAX) { /* anti-eviction */ + vmsig_audit a = { VMSIG_AUDIT_REJECT, uid, 0, 0, pid }; + core_audit(L->core, &a); + close(fd); + continue; + } + sock_conn* conn = calloc(1, sizeof *conn); + if (!conn) { close(fd); continue; } + conn->fd = fd; conn->core = L->core; conn->id = -1; + conn->uid = uid; conn->last_ns = now_ns(); conn->L = L; + conn->lnext = L->conns; L->conns = conn; + int id = vmsig_core_add_control(L->core, &CONN_OPS, conn, &g); + if (id < 0) { /* no slot — reject */ + vmsig_audit a = { VMSIG_AUDIT_REJECT, uid, 0, 0, pid }; + core_audit(L->core, &a); + listener_unlink(L, conn); close(fd); free(conn); continue; + } + conn->id = id; + 
vmsig_audit a = { VMSIG_AUDIT_ADMIT, g.principal, 0, 0, pid }; + core_audit(L->core, &a); + } +} + +/* janitor: detach connections with a stuck partial frame (slowloris) */ +static void on_janitor(void* user, uint32_t events) { + (void)events; + sock_listener* L = user; + uint64_t v; + while (read(L->janitor_fd, &v, sizeof v) == (ssize_t)sizeof v) { /* drain */ } + uint64_t now = now_ns(); + for (sock_conn* c = L->conns; c; c = c->lnext) + if (c->buflen > 0 && now - c->last_ns > VMSIG_SOCK_IDLE_NS) + core_request_drop(c->core, c->id); +} + +/* listener cleanup on core_free (owner = the core, via on_free of the first source) */ +static void listener_free(void* user) { + sock_listener* L = user; + if (L->janitor_fd >= 0) close(L->janitor_fd); + if (L->listen_fd >= 0) close(L->listen_fd); + free(L); +} + +int vmsig_socket_attach(vmsig_core* core, const char* path, + vmsig_socket_policy policy, void* ud) { + if (!core || !path || !*path) return -1; + int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0); + if (fd < 0) return -1; + + struct sockaddr_un addr; + memset(&addr, 0, sizeof addr); + addr.sun_family = AF_UNIX; + socklen_t alen; + size_t n = strlen(path); + if (path[0] == '@') { /* abstract namespace */ + if (n > sizeof addr.sun_path) { close(fd); return -1; } + addr.sun_path[0] = 0; + memcpy(addr.sun_path + 1, path + 1, n - 1); + alen = (socklen_t)(offsetof(struct sockaddr_un, sun_path) + n); + } else { /* filesystem path */ + if (n >= sizeof addr.sun_path) { close(fd); return -1; } + unlink(path); + memcpy(addr.sun_path, path, n); + alen = (socklen_t)sizeof addr; + } + /* Create the filesystem socket with restrictive perms (0600): the path must not be + * the only gate — connect requires write, so we open it to the owner only. + * (An abstract socket has no FS perms; its access is bounded by the net namespace.) 
*/ + mode_t old_um = 0; + int restrict_perm = (path[0] != '@'); + if (restrict_perm) old_um = umask(0177); + int br = bind(fd, (struct sockaddr*)&addr, alen); + if (restrict_perm) umask(old_um); + if (br < 0) { close(fd); return -1; } + if (listen(fd, 64) < 0) { close(fd); return -1; } + + int jfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); + if (jfd < 0) { close(fd); return -1; } + struct itimerspec its; + memset(&its, 0, sizeof its); + its.it_interval.tv_sec = VMSIG_SOCK_JANITOR_S; + its.it_value = its.it_interval; + if (timerfd_settime(jfd, 0, &its, NULL) < 0) { close(jfd); close(fd); return -1; } + + sock_listener* L = calloc(1, sizeof *L); + if (!L) { close(jfd); close(fd); return -1; } + L->listen_fd = fd; L->janitor_fd = jfd; L->core = core; L->policy = policy; L->ud = ud; + /* the listen source owns the listener (on_free=listener_free closes both fds + free) */ + if (core_add_source(core, fd, on_accept, L, listener_free) < 0) { + close(jfd); close(fd); free(L); return -1; + } + /* janitor without on_free (L already belongs to the core); on error core_free releases it */ + if (core_add_source(core, jfd, on_janitor, L, NULL) < 0) return -1; + return 0; +} diff --git a/src/core/core.c b/src/core/core.c new file mode 100644 index 0000000..0dcd0dd --- /dev/null +++ b/src/core/core.c @@ -0,0 +1,224 @@ +/* core.c — core lifecycle and registration of adapters/controls. + * The loop and pumps live in loop.c. */ +#include "core_internal.h" +#include +#include +#include +#include +#include +#include + +core_slot* core_register_fd(vmsig_core* c, int fd, uint32_t epoll_events, slot_role role) { + /* reuse a detached (SLOT_DEAD) slot so c->slots[] does not grow on every + * connection */ + core_slot* s = NULL; + for (int i = 0; i < c->nslots; i++) + if (c->slots[i]->role == SLOT_DEAD) { s = c->slots[i]; break; } + + if (!s) { + if (c->nslots == c->cap_slots) { + int ncap = c->cap_slots ? 
c->cap_slots * 2 : 16; + core_slot** ns = realloc(c->slots, (size_t)ncap * sizeof *ns); + if (!ns) return NULL; + c->slots = ns; + c->cap_slots = ncap; + } + s = calloc(1, sizeof *s); + if (!s) return NULL; + c->slots[c->nslots++] = s; + } + + memset(s, 0, sizeof *s); + s->role = role; + s->fd = fd; + + struct epoll_event ee; + memset(&ee, 0, sizeof ee); + ee.events = epoll_events; + ee.data.ptr = s; + if (epoll_ctl(c->epfd, EPOLL_CTL_ADD, fd, &ee) < 0) { s->role = SLOT_DEAD; return NULL; } + + return s; +} + +vmsig_core* vmsig_core_new(vmsig_ctx* ctx) { + if (!ctx) return NULL; + vmsig_core* c = calloc(1, sizeof *c); + if (!c) return NULL; + c->ctx = ctx; + c->epfd = -1; + c->wake_fd = -1; + + c->epfd = epoll_create1(EPOLL_CLOEXEC); + if (c->epfd < 0) { free(c); return NULL; } + + c->wake_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (c->wake_fd < 0) { close(c->epfd); free(c); return NULL; } + if (!core_register_fd(c, c->wake_fd, EPOLLIN, SLOT_WAKEUP)) { + close(c->wake_fd); close(c->epfd); free(c); return NULL; + } + + /* context pacing timerfds (created in ctx_new) as loop sources */ + for (int d = VMSIG_DIR_UP; d <= VMSIG_DIR_DOWN; d++) { + int tfd = vmsig_ctx_timing_fd(ctx, (vmsig_dir)d); + if (tfd >= 0) core_register_fd(c, tfd, EPOLLIN, SLOT_CTX_TIMING); + } + return c; +} + +int vmsig_core_add_adapter(vmsig_core* c, const vmsig_adapter_ops* ops, + const void* cfg, uint32_t endpoint) { + if (!c || !ops || c->nadapters >= VMSIG_MAX_ADAPTERS) return -1; + + vmsig_adapter* a = ops->open(cfg, endpoint); + if (!a) return -1; + + vmsig_emit emit = { core_emit_up, core_register_memctx, core_unregister_memctx, c }; + vmsig_fd_reg reg[VMSIG_ADAPTER_FDS]; + memset(reg, 0, sizeof reg); + + int n = ops->attach(a, &emit, reg, VMSIG_ADAPTER_FDS); + if (n < 0) { ops->close(a); return -1; } + + for (int i = 0; i < n; i++) { + uint32_t events = reg[i].epoll_events ? 
reg[i].epoll_events : (uint32_t)EPOLLIN; + core_slot* s = core_register_fd(c, reg[i].fd, events, SLOT_ADAPTER); + if (!s) { ops->close(a); return -1; } + s->ops = ops; + s->adapter = a; + s->cookie = reg[i].cookie; + } + + int id = c->nadapters; + c->adapters[c->nadapters].ops = ops; + c->adapters[c->nadapters].a = a; + c->adapters[c->nadapters].endpoint = endpoint; + c->nadapters++; + return id; +} + +int vmsig_core_add_control(vmsig_core* c, const vmsig_control_ops* ops, void* ctl, + const vmsig_grant* grant) { + if (!c || !ops) return -1; + + /* reuse a freed (reaped) slot; otherwise grow up to the ceiling */ + int id = -1; + for (int i = 0; i < c->ncontrols; i++) + if (!c->controls[i].active) { id = i; break; } + if (id < 0) { + if (c->ncontrols >= VMSIG_MAX_CONTROLS) return -1; + id = c->ncontrols++; + } + core_control_ent* e = &c->controls[id]; + uint16_t gen = e->gen; /* generation survives the slot memset */ + memset(e, 0, sizeof *e); + e->gen = (uint16_t)(gen + 1); /* new generation for this (re)use */ + e->ops = ops; + e->ctl = ctl; + e->active = 1; + if (grant) e->grant = *grant; /* otherwise stays zero => default-deny */ + e->dctx.core = c; + e->dctx.ctl_id = id; + + if (ops->subscribe) ops->subscribe(ctl, &e->sub); + /* emit_down token is our down_ctx, so emit_down can find this control's grant */ + if (ops->set_emit_down) ops->set_emit_down(ctl, core_emit_down, &e->dctx); + + int fd = ops->fd ? ops->fd(ctl) : -1; + if (fd >= 0) { + core_slot* s = core_register_fd(c, fd, EPOLLIN, SLOT_CONTROL); + if (!s) return -1; + s->cops = ops; + s->ctl = ctl; + e->slot = s; + } + + /* Late subscriber: replay retained MEMCTX (if a context is already published and + * this control is qualified). For a control added BEFORE the first publication, + * the cell is not yet valid — it receives MEMCTX via the normal multicast in pump_up. 
*/ + core_memctx_replay(c, id); + + return id; /* ncontrols already bumped when picking id (on growth); reuse does not grow it */ +} + + +/* ===== MEMCTX registration: per-endpoint retain cell (called by the adapter on the loop thread) ===== + * Registers the address-space context adapter's reg hooks. The core holds THIS and does + * NOT store a copy of the locator: on delivery/replay it calls reg.describe/share_fd. + * valid/epoch are maintained in route/epoch_bump (not here): register only records that + * "the adapter is connected". */ +int core_register_memctx(void* token, const vmsig_memctx_reg* reg) { + vmsig_core* c = token; + if (!c || !reg || reg->endpoint >= 64) return -1; + core_memctx_cell* cell = &c->memctx[reg->endpoint]; + cell->reg = *reg; + cell->registered = 1; + return 0; +} + +void core_unregister_memctx(void* token, uint32_t endpoint) { + vmsig_core* c = token; + if (!c || endpoint >= 64) return; + core_memctx_cell* cell = &c->memctx[endpoint]; + cell->registered = 0; + cell->valid = 0; + memset(&cell->reg, 0, sizeof cell->reg); +} + +void vmsig_core_set_audit(vmsig_core* c, void (*cb)(void* ud, const vmsig_audit* a), void* ud) { + if (!c) return; + c->audit_cb = cb; + c->audit_ud = ud; +} + +void core_audit(vmsig_core* c, const vmsig_audit* a) { + if (c && c->audit_cb) c->audit_cb(c->audit_ud, a); +} + +void vmsig_core_set_arb_policy(vmsig_core* c, vmsig_arb_policy cb, void* ud) { + if (!c) return; + c->arb_cb = cb; + c->arb_ud = ud; + /* lease[][] is zeroed in vmsig_core_new (calloc) => all cells free. 
*/ +} + +int core_add_source(vmsig_core* c, int fd, void (*cb)(void* user, uint32_t events), + void* user, void (*on_free)(void* user)) { + if (!c || fd < 0 || !cb) return -1; + core_slot* s = core_register_fd(c, fd, EPOLLIN, SLOT_SOURCE); + if (!s) return -1; + s->on_source = cb; + s->on_free = on_free; + s->source_user = user; + return 0; +} + +void core_request_drop(vmsig_core* c, int ctl_id) { + if (!c || ctl_id < 0 || ctl_id >= c->ncontrols) return; + c->controls[ctl_id].reap = 1; + core_wake(c); /* wake the loop for a reap pass (without stop) */ +} + +void vmsig_core_free(vmsig_core* c) { + if (!c) return; + /* graceful: stop workers and close SI handles / sockets. Adapters are closed + * FIRST: their close stops off-loop workers and unregisters their seams (e.g. + * memctx) BEFORE destruction. */ + for (int i = 0; i < c->nadapters; i++) + if (c->adapters[i].ops->close) c->adapters[i].ops->close(c->adapters[i].a); + for (int i = 0; i < c->ncontrols; i++) + if (c->controls[i].active && c->controls[i].ops->close) + c->controls[i].ops->close(c->controls[i].ctl); + + /* cleanup of fd sources (e.g. unix listener: close listen/janitor fd + free) */ + for (int i = 0; i < c->nslots; i++) + if (c->slots[i]->role == SLOT_SOURCE && c->slots[i]->on_free) + c->slots[i]->on_free(c->slots[i]->source_user); + + for (int i = 0; i < c->nslots; i++) free(c->slots[i]); + free(c->slots); + if (c->wake_fd >= 0) close(c->wake_fd); + if (c->epfd >= 0) close(c->epfd); + /* ctx is not ours: its owner frees it */ + free(c); +} diff --git a/src/core/include/core_internal.h b/src/core/include/core_internal.h new file mode 100644 index 0000000..b0f961b --- /dev/null +++ b/src/core/include/core_internal.h @@ -0,0 +1,170 @@ +#ifndef VMSIG_CORE_INTERNAL_H +#define VMSIG_CORE_INTERNAL_H +#include "vmsig_core.h" +#include + +/* Private internals of the epoll core. Each registered fd carries a + * core_slot* in epoll_event.data.ptr; the slot's role decides how to handle it. 
 */
+
+/* NOTE(review): presumably the epoll_wait batch size — it is consumed outside this
+ * header (loop.c); confirm there. */
+#define VMSIG_MAX_EVENTS 64
+#define VMSIG_MAX_ADAPTERS 256 /* up to ~64 VMs * 3 adapters + slack (mode A) */
+#define VMSIG_MAX_CONTROLS 64 /* concurrent pollers; more => processes (C) */
+#define VMSIG_ADAPTER_FDS 8 /* max fds per adapter */
+#define VMSIG_DOWN_PENDING_MAX 256 /* ceiling of DOWN commands per poller in ctx (fairness) */
+
+/* What a registered fd is, i.e. which group of core_slot fields is meaningful. */
+typedef enum {
+    SLOT_WAKEUP, /* wake/stop eventfd */
+    SLOT_ADAPTER, /* adapter fd (timerfd/eventfd/socket) */
+    SLOT_CTX_TIMING, /* context pacing timerfd */
+    SLOT_CONTROL, /* out-of-process control socket */
+    SLOT_SOURCE, /* arbitrary fd + callback (e.g. listen-fd) */
+    SLOT_DEAD /* detached (reaped); loop ignores it */
+} slot_role;
+
+/* One epoll registration. Only the field group matching `role` is filled in; the rest
+ * stays zero (core_register_fd clears the whole slot before use). */
+typedef struct core_slot {
+    slot_role role;
+    int fd;
+    /* for SLOT_ADAPTER */
+    const vmsig_adapter_ops* ops;
+    vmsig_adapter* adapter;
+    uint32_t cookie;
+    /* for SLOT_CONTROL */
+    const vmsig_control_ops* cops;
+    void* ctl;
+    /* for SLOT_SOURCE */
+    void (*on_source)(void* user, uint32_t events);
+    void (*on_free)(void* user); /* invoked at core_free (source cleanup) */
+    void* source_user;
+} core_slot;
+
+/* One attached adapter: its vtable, its instance, and the endpoint (VM) it was opened
+ * for. Entries are appended by vmsig_core_add_adapter. */
+typedef struct {
+    const vmsig_adapter_ops* ops;
+    vmsig_adapter* a;
+    uint32_t endpoint;
+} core_adapter_ent;
+
+
+/* ===== Retained address-space context (MEMCTX seam) =====
+ * The core retains per-endpoint "a current context exists in the current epoch" + the
+ * adapter's reg pointer (describe/share_fd/invalidate). Replays to a late qualified
+ * subscriber (CAP_MEMCTX + source_mask + endpoint) re-sharing the RO-fd. Does NOT store a
+ * copy of the locator: on delivery/replay it calls reg.describe (adapter snapshot) +
+ * reg.share_fd (fresh RO-fd). Invalidated on epoch change; cleared on unregister/free. */
+typedef struct {
+    int registered; /* adapter called register_memctx (reg valid) */
+    int valid; /* a published context exists in the current epoch */
+    uint32_t epoch; /* snapshot epoch (== core epoch[ep] when valid) */
+    vmsig_memctx_reg reg; /* valid when registered */
+} core_memctx_cell;
+
+/* ===== Lease layer (arbitration of exclusive ownership of destructive resources) =====
+ * One cell per (endpoint, lease-class): who owns it (origin) + a snapshot of arb_prio at
+ * acquisition time. owner=0 => free. The snapshot (rather than the live grant) makes the
+ * policy resilient to the owner's grant changing after acquisition. */
+#define VMSIG_LEASE_CLASSES 3 /* INPUT, POWER, MEMWRITE (== VMSIG_LEASE_CLASS_MAX) */
+typedef struct {
+    uint32_t owner; /* origin (gen<<16)|(id+1) of the owner; 0 = free */
+    uint32_t owner_prio; /* owner's arb_prio at acquisition time (snapshot) */
+} core_lease_cell;
+
+struct vmsig_core; /* fwd for core_down_ctx */
+
+/* DOWN emission context: handed to a control in set_emit_down so emit_down knows WHICH
+ * control issued the command (for grant lookup and enforcement). Stable: lives in the
+ * fixed controls[] array. */
+typedef struct {
+    struct vmsig_core* core;
+    int ctl_id;
+} core_down_ctx;
+
+/* One attached control (poller). The array index of the entry is the control id;
+ * vmsig_core_add_control reuses entries whose `active` is 0. */
+typedef struct {
+    const vmsig_control_ops* ops;
+    void* ctl;
+    vmsig_sub sub;
+    vmsig_grant grant; /* poller's rights ceiling (default-deny) */
+    core_down_ctx dctx; /* token for emit_down */
+    int active; /* 0 = detached/reaped (slot free) */
+    int reap; /* reap requested (deferred) */
+    core_slot* slot; /* SLOT_CONTROL fd slot (or NULL) */
+    uint32_t pending; /* DOWN commands of this poller in ctx (fairness cap) */
+    uint16_t gen; /* slot generation: +1 on each (re)use */
+} core_control_ent;
+
+/* The core instance: epoll/wake fds, the borrowed transfer context, the fixed adapter
+ * and control tables, the growable slot table, and per-endpoint epoch/MEMCTX/lease state. */
+struct vmsig_core {
+    int epfd;
+    int wake_fd; /* eventfd: nudge + stop */
+    vmsig_ctx* ctx;
+    /* NOTE(review): sig_atomic_t suggests it is written from signal context
+     * (cli.c calls vmsig_core_stop from its SIGINT handler) — confirm in loop.c. */
+    volatile sig_atomic_t stopping;
+
+    core_adapter_ent adapters[VMSIG_MAX_ADAPTERS];
+    int nadapters;
+    core_control_ent controls[VMSIG_MAX_CONTROLS];
+    int ncontrols;
+
+    core_slot** slots; /* all allocated slots (for free) */
+    int nslots;
+    int cap_slots;
+
+
+    uint32_t epoch[64]; /* per-endpoint VM session epoch */
+    core_memctx_cell memctx[64]; /* per-endpoint retained context */
+
+    core_lease_cell lease[64][VMSIG_LEASE_CLASSES]; /* lease per (endpoint, class) */
+    vmsig_arb_policy arb_cb; /* preemption policy (NULL=default) */
+    void* arb_ud;
+
+    void (*audit_cb)(void* ud, const vmsig_audit* a);
+    void* audit_ud;
+};
+
+/* Emit an audit record (no-op if no callback is set). Defined in core.c. */
+void core_audit(vmsig_core* c, const vmsig_audit* a);
+
+/* Register an fd in epoll + create a slot (see core.c). */
+core_slot* core_register_fd(vmsig_core* c, int fd, uint32_t epoll_events, slot_role role);
+
+/* Register an arbitrary fd source with a callback (e.g. a socket listen-fd).
+ * The callback is called on the loop thread when the fd is ready. on_free (may be NULL)
+ * is called at vmsig_core_free to clean up the source's resource. 0/-1. */
+int core_add_source(vmsig_core* c, int fd, void (*cb)(void* user, uint32_t events),
+                    void* user, void (*on_free)(void* user));
+
+/* Request detaching a control by id (deferred reap after the batch: epoll DEL,
+ * close fd, ops->close). Safe to call from the control's own on_readable. */
+void core_request_drop(vmsig_core* c, int ctl_id);
+
+/* emit hooks handed to adapters (UP) and controls (DOWN). Defined in loop.c. */
+int core_emit_up (void* token, vmsig_event* ev);
+int core_emit_down(void* token, vmsig_event* ev);
+
+/* ===== Address-space context (MEMCTX seam; retained context) ===== */
+/* Context registration hooks (handed to the adapter in vmsig_emit; defined in core.c). */
+int core_register_memctx (void* token, const vmsig_memctx_reg* reg);
+void core_unregister_memctx(void* token, uint32_t endpoint);
+
+/* Multicast MEMCTX to qualified subscribers + mark the retain cell valid
+ * (from pump_up on the VMSIG_EV_MEMCTX trigger; defined in loop.c). */
+void core_memctx_route(vmsig_core* c, const vmsig_event* trigger);
+
+/* Replay retained MEMCTX to a single (late) subscriber (from vmsig_core_add_control;
+ * defined in loop.c). */
+void core_memctx_replay(vmsig_core* c, int ctl_id);
+
+/* Bump the endpoint's epoch on a destructive lifecycle transition: epoch++, invalidate
+ * the retain cell, emit MEMCTX_INVALIDATED, request re-bootstrap from the adapter.
+ * Observed by the core in pump_up on UP VM_LIFECYCLE (defined in loop.c). */
+void core_epoch_bump(vmsig_core* c, uint32_t endpoint);
+
+/* ===== Lease layer (defined in loop.c) ===== */
+/* Intercept CMD_ACQUIRE/RELEASE/LEASE_STATUS (synchronously from core_emit_down, not in ctx). */
+void core_lease_acquire(vmsig_core* c, int ctl_id, const vmsig_event* ev);
+void core_lease_release(vmsig_core* c, int ctl_id, const vmsig_event* ev);
+void core_lease_status (vmsig_core* c, int ctl_id, const vmsig_event* ev);
+
+/* Reclaim the lease of a dead control (from core_reap, BEFORE e->active=0).
*/ +void core_lease_reap_control(vmsig_core* c, int ctl_id); + +/* Wake the loop (eventfd nudge). Defined in loop.c. */ +void core_wake(vmsig_core* c); + +#endif /* VMSIG_CORE_INTERNAL_H */ diff --git a/src/core/linux/loop.c b/src/core/linux/loop.c new file mode 100644 index 0000000..d2eb072 --- /dev/null +++ b/src/core/linux/loop.c @@ -0,0 +1,620 @@ +/* loop.c — non-blocking epoll loop, dispatch, pump up/down, emit hooks, + * graceful shutdown. No sleep/polling/busy-wait: every wakeup is an fd. */ +#include "core_internal.h" +#include +#include +#include +#include +#include + +static void drain_counter_fd(int fd) { + uint64_t v; + while (read(fd, &v, sizeof v) == (ssize_t)sizeof v) { /* drain; NOTE(review): relies on fd being non-blocking, else the next read would block — confirm at creation */ } +} + +void core_wake(vmsig_core* c) { + uint64_t one = 1; + ssize_t r = write(c->wake_fd, &one, sizeof one); + (void)r; /* best-effort nudge: a failed write is deliberately ignored */ +} + +int core_emit_up(void* token, vmsig_event* ev) { + vmsig_core* c = token; + int r = vmsig_ctx_submit(c->ctx, VMSIG_DIR_UP, ev); + core_wake(c); /* nudge in case of emission off the loop thread */ + return r; +} + +/* origin = (gen<<16)|(id+1): low 16 bits are the control's id+1, high bits the slot + * generation. Lets a reply be addressed to the initiator and stale reuse filtered out. */ +static uint32_t origin_pack(int id, uint16_t gen) { + return ((uint32_t)gen << 16) | ((uint32_t)(id + 1) & 0xFFFFu); +} +/* Live control by origin with generation check; NULL if origin is 0, the id is out of range, or the slot is inactive/reused. */ +static core_control_ent* origin_ctl(vmsig_core* c, uint32_t origin) { + if (!origin) return NULL; /* 0 = unaddressed (no initiator) */ + int id = (int)(origin & 0xFFFFu) - 1; + uint16_t gen = (uint16_t)(origin >> 16); + if (id < 0 || id >= c->ncontrols) return NULL; + core_control_ent* e = &c->controls[id]; + if (!e->active || e->gen != gen) return NULL; + return e; +} + +/* Capability for a DOWN command (unknown => deny). Destructive CMD_LIFECYCLE + * (powerdown/reset, code in inln[0]) requires CAP_POWER, safe ones CAP_LIFECYCLE. 
*/ +static uint32_t cap_for_down(const vmsig_event* ev) { + switch (ev->kind) { + case VMSIG_EV_CMD_INPUT: + case VMSIG_EV_CMD_QUERY_INPUT: return VMSIG_CAP_INPUT; /* injection / held-key query */ + case VMSIG_EV_CMD_LIFECYCLE: + return (ev->inln[0] == VMSIG_LIFE_POWERDOWN || ev->inln[0] == VMSIG_LIFE_RESET) + ? VMSIG_CAP_POWER : VMSIG_CAP_LIFECYCLE; + case VMSIG_EV_CMD_VM: /* op in inln[0] (vmsig_vm_cmd, op<256) */ + return (ev->inln[0] == VMSIG_VMOP_RESET || ev->inln[0] == VMSIG_VMOP_POWERDOWN || + ev->inln[0] == VMSIG_VMOP_QUIT) ? VMSIG_CAP_POWER : VMSIG_CAP_VM; + case VMSIG_EV_CMD_MEMWRITE: return VMSIG_CAP_MEMWRITE; /* atomic guest-memory write */ + default: return 0; + } +} +/* ===== Lease layer: classification and helpers ===== */ + +/* Lease class for a DOWN command. MIRRORS cap_for_down by destructiveness: + * - CMD_INPUT -> INPUT; + * - CMD_LIFECYCLE powerdown/reset -> POWER; + * - CMD_VM reset/powerdown/quit -> POWER; CMD_MEMWRITE -> MEMWRITE; + * - everything else (safe/read-only/stream/query) -> -1 (not lease-gated). + * CMD_LIFECYCLE and CMD_VM route to DIFFERENT adapters (INPUT/VMHOST) but share ONE + * POWER class per endpoint: a single owner of VM destruction (intentional). */ +static int lease_class_for_down(const vmsig_event* ev) { + switch (ev->kind) { + case VMSIG_EV_CMD_INPUT: + return VMSIG_LEASE_INPUT; + case VMSIG_EV_CMD_LIFECYCLE: + return (ev->inln[0] == VMSIG_LIFE_POWERDOWN || ev->inln[0] == VMSIG_LIFE_RESET) + ? VMSIG_LEASE_POWER : -1; + case VMSIG_EV_CMD_VM: + return (ev->inln[0] == VMSIG_VMOP_RESET || ev->inln[0] == VMSIG_VMOP_POWERDOWN || + ev->inln[0] == VMSIG_VMOP_QUIT) ? VMSIG_LEASE_POWER : -1; + case VMSIG_EV_CMD_MEMWRITE: + return VMSIG_LEASE_MEMWRITE; /* always destructive (write to shared guest memory) */ + default: + return -1; + } +} +/* Cap required to lease a class (probing/holding a class without the cap is forbidden). */ +static uint32_t cap_for_lease_class(int cls) { + return cls == VMSIG_LEASE_INPUT ? 
VMSIG_CAP_INPUT + : cls == VMSIG_LEASE_POWER ? VMSIG_CAP_POWER + : cls == VMSIG_LEASE_MEMWRITE ? VMSIG_CAP_MEMWRITE : 0u; +} +/* Source bitmask permitted to hold a lease class: mirrors the grant's source ceiling + * (which grant_allows_down enforces on the command itself). Leasing is intercepted + * BEFORE grant_allows_down, so source is checked HERE — otherwise a principal without + * the required seam could hold someone else's cell (DoS), bypassing source_mask. + * INPUT -> SRC_INPUT; POWER -> SRC_INPUT (lifecycle) OR SRC_VMHOST (vm) — one + * destructive path suffices; MEMWRITE -> SRC_MEMCTX (lives on the MEMCTX seam). */ +static uint32_t source_mask_for_lease_class(int cls) { + return cls == VMSIG_LEASE_INPUT ? (1u << VMSIG_SRC_INPUT) + : cls == VMSIG_LEASE_POWER ? ((1u << VMSIG_SRC_INPUT) | (1u << VMSIG_SRC_VMHOST)) + : cls == VMSIG_LEASE_MEMWRITE ? (1u << VMSIG_SRC_MEMCTX) : 0u; +} + +/* Capability to receive an UP event: address-space context (MEMCTX/MEMCTX_INVALIDATED) + * -> CAP_MEMCTX; cursor is screen data, available to a GUI observer (OBSERVE) OR an + * input actor (INPUT); otherwise CAP_OBSERVE (frames/SEAM/generic). The grant_allows_up + * gate checks intersection, so OBSERVE|INPUT means "either of the two". */ +static uint32_t cap_for_up(const vmsig_event* ev) { + if (ev->kind == VMSIG_EV_CURSOR_STATE) return VMSIG_CAP_OBSERVE | VMSIG_CAP_INPUT; + return (ev->source == VMSIG_SRC_MEMCTX) ? 
VMSIG_CAP_MEMCTX : VMSIG_CAP_OBSERVE; +} +static int grant_allows_down(const vmsig_grant* g, const vmsig_event* ev) { + if (ev->endpoint >= 64) return 0; /* 64-bit mask: <=64 VMs/cores */ + if (!(g->endpoint_mask & (1ull << ev->endpoint))) return 0; + if ((uint32_t)ev->source >= 32u || !(g->source_mask & (1u << ev->source))) return 0; /* source ceiling on DOWN too; a source outside the 32-bit mask is denied before the shift (shifting by >= 32 is undefined) */ + uint32_t need = cap_for_down(ev); + return need && (g->cap_mask & need); +} +static int grant_allows_up(const vmsig_grant* g, const vmsig_event* ev) { + if (ev->endpoint >= 64) return 0; + if (!(g->cap_mask & cap_for_up(ev))) return 0; + if (!(g->endpoint_mask & (1ull << ev->endpoint))) return 0; + if ((uint32_t)ev->source >= 32u || !(g->source_mask & (1u << ev->source))) return 0; /* out-of-range source => deny, never an undefined shift */ + return 1; +} + +/* Find an adapter by (endpoint, source). NULL if none. Used by pump_down to route a + * DOWN command to its adapter. */ +static core_adapter_ent* core_find_adapter(vmsig_core* c, uint32_t endpoint, + vmsig_source source) { + for (int i = 0; i < c->nadapters; i++) { + core_adapter_ent* e = &c->adapters[i]; + if (e->ops->source == source && e->endpoint == endpoint) return e; + } + return NULL; +} + +/* ===== Lease layer: grant/release/status/finalization/reclaim ===== + * Intercepted in core_emit_down BEFORE grant_allows_down (synchronous, not in ctx, does + * not touch pending). Addressed UP replies to the initiator via core_emit_up + * (origin+generation). */ + +/* Addressed UP reply to the initiator of a lease request. 
*/ +static void lease_reply(vmsig_core* c, const vmsig_event* req, vmsig_kind kind, + uint32_t cls, uint32_t reason) { + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = kind; up.source = VMSIG_SRC_CORE; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_URGENT; up.endpoint = req->endpoint; up.origin = req->origin; + vmsig_lease_req lr = { cls, reason }; + memcpy(up.inln, &lr, sizeof lr); + core_emit_up(c, &up); +} + +/* Lease denial: audit (visibility of authorization/contention denials — capability/ + * endpoint enumeration via ACQUIRE is observable) + addressed LEASE_DENIED to initiator. */ +static void lease_deny(vmsig_core* c, const vmsig_event* req, uint32_t principal, + uint32_t cls, uint32_t reason) { + vmsig_audit a = { VMSIG_AUDIT_LEASE_DENIED, principal, req->endpoint, cls, reason }; + core_audit(c, &a); + lease_reply(c, req, VMSIG_EV_LEASE_DENIED, cls, reason); +} + +/* Principal of the cell owner (for STATUS); 0 if owner is dead/absent. */ +static uint32_t lease_owner_principal(vmsig_core* c, uint32_t owner) { + core_control_ent* e = origin_ctl(c, owner); + return e ? e->grant.principal : 0u; +} + +/* IMPORTANT (layer isolation): signaling does NOT release held keys on lease loss and + * does NOT track held state at all. held is the ACTUATOR's record (vmctl); release is the + * control's decision. On owner change/reset the cell is simply freed; stuck keys remain + * the control's concern (it can issue CMD_QUERY_INPUT and release its own while owner). */ + +void core_lease_acquire(vmsig_core* c, int ctl_id, const vmsig_event* ev) { + core_control_ent* e = &c->controls[ctl_id]; + vmsig_lease_req lrq; memcpy(&lrq, ev->inln, sizeof lrq); uint32_t cls = lrq.cls; /* copy the request out of the raw inline bytes instead of casting (no alignment/aliasing UB) */ + uint32_t ep = ev->endpoint; + + /* 1. validate class/endpoint/grant (default-deny; every denial is audited). 
*/ + if (cls >= VMSIG_LEASE_CLASS_MAX) { + lease_deny(c, ev, e->grant.principal, cls, VMSIG_LEASE_DENY_BADCLASS); + return; + } + if (ep >= 64 || !(e->grant.endpoint_mask & (1ull << ep))) { + lease_deny(c, ev, e->grant.principal, cls, VMSIG_LEASE_DENY_NOGRANT); + return; + } + if (!(e->grant.cap_mask & cap_for_lease_class((int)cls))) { + lease_deny(c, ev, e->grant.principal, cls, VMSIG_LEASE_DENY_NOCAP); + return; + } + /* source ceiling: holding a class without rights to its seam is forbidden (else a + * DoS hold of someone else's cell bypassing source_mask, since interception is + * BEFORE grant_allows_down). */ + if (!(e->grant.source_mask & source_mask_for_lease_class((int)cls))) { + lease_deny(c, ev, e->grant.principal, cls, VMSIG_LEASE_DENY_NOGRANT); + return; + } + + core_lease_cell* cell = &c->lease[ep][cls]; /* NOTE(review): cls is bounded by VMSIG_LEASE_CLASS_MAX while the array is sized VMSIG_LEASE_CLASSES — confirm the two agree */ + uint32_t me = ev->origin; + + /* 2a. free OR dead owner (origin_ctl==NULL) => take as if free. */ + core_control_ent* owner_e = cell->owner ? origin_ctl(c, cell->owner) : NULL; + if (cell->owner == 0 || !owner_e) { + cell->owner = me; cell->owner_prio = e->grant.arb_prio; + vmsig_audit a = { VMSIG_AUDIT_LEASE_GRANTED, e->grant.principal, ep, cls, 0 }; + core_audit(c, &a); + lease_reply(c, ev, VMSIG_EV_LEASE_GRANTED, cls, 0); + return; + } + + /* 2b. owner is the caller itself => idempotent GRANTED. */ + if (cell->owner == me) { + lease_reply(c, ev, VMSIG_EV_LEASE_GRANTED, cls, 0); + return; + } + + /* 2c. held by a LIVE other owner => policy. incumbent is the live grant. */ + vmsig_arb_decision dec; + if (c->arb_cb) { + dec = c->arb_cb(c->arb_ud, ep, cls, &owner_e->grant, &e->grant); + } else { + dec = (e->grant.arb_prio > cell->owner_prio) ? VMSIG_ARB_PREEMPT : VMSIG_ARB_DENY; + } + if (dec != VMSIG_ARB_PREEMPT) { + /* equal priority => owner keeps it (HELD); strictly lower => LOWER_PRIO. */ + uint32_t reason = (e->grant.arb_prio < cell->owner_prio) + ? 
VMSIG_LEASE_DENY_LOWER_PRIO : VMSIG_LEASE_DENY_HELD; + lease_deny(c, ev, e->grant.principal, cls, reason); + return; + } + + /* PREEMPT: notify the old owner (REVOKED), switch owner, grant to the new one. + * signaling does NOT release held keys (that is the control's decision): the + * ex-owner is responsible for its stuck keys; the new owner can query held + * (CMD_QUERY_INPUT) and release them. */ + uint32_t old_owner = cell->owner; + { + vmsig_event rv; + memset(&rv, 0, sizeof rv); + rv.endpoint = ep; rv.origin = old_owner; + lease_reply(c, &rv, VMSIG_EV_LEASE_REVOKED, cls, 0); + } + { + vmsig_audit a = { VMSIG_AUDIT_LEASE_REVOKED, owner_e->grant.principal, ep, cls, 0 }; + core_audit(c, &a); + } + cell->owner = me; cell->owner_prio = e->grant.arb_prio; + { + vmsig_audit a = { VMSIG_AUDIT_LEASE_GRANTED, e->grant.principal, ep, cls, 0 }; + core_audit(c, &a); + } + lease_reply(c, ev, VMSIG_EV_LEASE_GRANTED, cls, 0); +} + +void core_lease_release(vmsig_core* c, int ctl_id, const vmsig_event* ev) { + core_control_ent* e = &c->controls[ctl_id]; + vmsig_lease_req lrq; memcpy(&lrq, ev->inln, sizeof lrq); uint32_t cls = lrq.cls; /* copy out of the inline bytes; no pointer-cast type punning */ + uint32_t ep = ev->endpoint; + + /* cross-endpoint isolation + cap/source gate BEFORE any action (like acquire). Failures return silently: no reply, no audit. */ + if (cls >= VMSIG_LEASE_CLASS_MAX || ep >= 64) return; + if (!(e->grant.endpoint_mask & (1ull << ep))) return; + if (!(e->grant.cap_mask & cap_for_lease_class((int)cls))) return; + if (!(e->grant.source_mask & source_mask_for_lease_class((int)cls))) return; + + core_lease_cell* cell = &c->lease[ep][cls]; + if (cell->owner != ev->origin) return; /* not owner => no-op */ + + /* signaling does NOT release held keys — that is the control's decision (it releases + * its own keys before release if needed). Here we only free the cell. 
*/ + cell->owner = 0; cell->owner_prio = 0; + lease_reply(c, ev, VMSIG_EV_LEASE_RELEASED, cls, 0); +} + +void core_lease_status(vmsig_core* c, int ctl_id, const vmsig_event* ev) { + core_control_ent* e = &c->controls[ctl_id]; + vmsig_lease_req lrq; memcpy(&lrq, ev->inln, sizeof lrq); uint32_t cls = lrq.cls; /* copy out of the inline bytes; no pointer-cast type punning */ + uint32_t ep = ev->endpoint; + + /* busy-state can be probed only within one's own endpoint and with the class cap + * (else a principal without CAP_INPUT/CAP_POWER would leak busy-state/other principal). */ + if (cls >= VMSIG_LEASE_CLASS_MAX || ep >= 64) return; + if (!(e->grant.endpoint_mask & (1ull << ep))) return; + if (!(e->grant.cap_mask & cap_for_lease_class((int)cls))) return; + if (!(e->grant.source_mask & source_mask_for_lease_class((int)cls))) return; + + core_lease_cell* cell = &c->lease[ep][cls]; + uint32_t busy = (cell->owner && origin_ctl(c, cell->owner)) ? 1u : 0u; /* a dead owner counts as free */ + + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_LEASE_STATUS; up.source = VMSIG_SRC_CORE; up.dir = VMSIG_DIR_UP; + up.prio = VMSIG_PRIO_URGENT; up.endpoint = ep; up.origin = ev->origin; + vmsig_lease_status st = { cls, busy, busy ? lease_owner_principal(c, cell->owner) : 0u }; + memcpy(up.inln, &st, sizeof st); + core_emit_up(c, &up); +} + +void core_lease_reap_control(vmsig_core* c, int ctl_id) { + /* Clear all cells owned by this (still live) slot, BEFORE active=0. + * origin is compared by the slot's current gen (active, gen valid at reap time). */ + uint32_t owner = origin_pack(ctl_id, c->controls[ctl_id].gen); + for (uint32_t ep = 0; ep < 64; ep++) { + for (int cls = 0; cls < VMSIG_LEASE_CLASSES; cls++) { + core_lease_cell* cell = &c->lease[ep][cls]; + if (cell->owner != owner) continue; + /* only free the cell; the dead owner's held keys are NOT our concern (vmctl's + * record; the next owner sees them via CMD_QUERY_INPUT and decides itself). 
*/ + cell->owner = 0; cell->owner_prio = 0; + vmsig_audit a = { VMSIG_AUDIT_LEASE_RECLAIMED, + c->controls[ctl_id].grant.principal, ep, (uint32_t)cls, 0 }; + core_audit(c, &a); + } + } +} + +/* DOWN emit from a control: enforcement against THIS control's grant. The event payload is released on every path that does not enqueue it. */ +int core_emit_down(void* token, vmsig_event* ev) { + core_down_ctx* d = token; + vmsig_core* c = d->core; + core_control_ent* e = &c->controls[d->ctl_id]; + if (!e->active) { vmsig_payload_release(ev); return -1; } + + /* Lease arbitration is intercepted HERE (synchronous, not in ctx, does not touch + * pending). origin is needed for the addressed reply and as the owner key. */ + if (ev->kind == VMSIG_EV_CMD_ACQUIRE || ev->kind == VMSIG_EV_CMD_RELEASE || + ev->kind == VMSIG_EV_CMD_LEASE_STATUS) { + ev->origin = origin_pack(d->ctl_id, e->gen); + if (ev->kind == VMSIG_EV_CMD_ACQUIRE) core_lease_acquire(c, d->ctl_id, ev); + else if (ev->kind == VMSIG_EV_CMD_RELEASE) core_lease_release(c, d->ctl_id, ev); + else core_lease_status(c, d->ctl_id, ev); + vmsig_payload_release(ev); + return 0; + } + + if (!grant_allows_down(&e->grant, ev)) { + vmsig_audit a = { VMSIG_AUDIT_DOWN_DENIED, e->grant.principal, + ev->endpoint, (uint32_t)ev->kind, 0 }; + core_audit(c, &a); /* rejected by policy (endpoint/source/class) */ + vmsig_payload_release(ev); + return -1; + } + + /* Lease GATE: destruction is passed ONLY by the class's current owner. + * A non-owner (or an owner whose slot is dead) => drop + audit LEASE_DENIED + * (distinguishable from grant-deny). A free cell => also drop: destruction cannot be + * used without an explicit lease. Safe/read-only commands (cls<0) are not gated. 
*/ + { + int cls = lease_class_for_down(ev); + if (cls >= 0 && ev->endpoint < 64) { + uint32_t me = origin_pack(d->ctl_id, e->gen); + uint32_t owner = c->lease[ev->endpoint][cls].owner; + if (owner != me || !origin_ctl(c, owner)) { + vmsig_audit a = { VMSIG_AUDIT_LEASE_DENIED, e->grant.principal, + ev->endpoint, (uint32_t)ev->kind, 0 }; + core_audit(c, &a); + vmsig_payload_release(ev); + return -1; + } + } + } + + if (e->pending >= VMSIG_DOWN_PENDING_MAX) { /* fairness/DoS: DOWN cap per poller */ + vmsig_audit a = { VMSIG_AUDIT_DOWN_DENIED, e->grant.principal, + ev->endpoint, (uint32_t)ev->kind, 0 }; + core_audit(c, &a); + vmsig_payload_release(ev); + return -1; + } + ev->origin = origin_pack(d->ctl_id, e->gen); /* addressed reply + pending accounting */ + e->pending++; + int r = vmsig_ctx_submit(c->ctx, VMSIG_DIR_DOWN, ev); + if (r != 0) { e->pending--; if (r < 0) vmsig_payload_release(ev); } /* not enqueued (drop/coalesce/err); on error ctx neither queued nor released the payload, so release it here like every other -1 path */ + core_wake(c); + return r; +} + +static int sub_match(const vmsig_sub* sub, const vmsig_event* ev) { + if (sub->source_mask && ((uint32_t)ev->source >= 32u || !(sub->source_mask & (1u << ev->source)))) return 0; /* mask 0 = any source; an out-of-range source never matches a non-zero mask (no undefined shift) */ + if (ev->prio < sub->prio_min) return 0; + if (sub->endpoint_mask) { + if (ev->endpoint >= 64 || !(sub->endpoint_mask & (1ull << ev->endpoint))) return 0; + } + return 1; +} + +/* ===== Address-space context (MEMCTX seam): multicast / retain-replay / epoch ===== + * The core vends ONE coherent datum per-endpoint: kcr3+locator paired with an RO-fd. A + * MEMCTX trigger from the adapter => the core builds the AUTHORITATIVE locator from the + * adapter snapshot (reg.describe) + stamps the epoch (single source of truth) and + * distributes to qualified subscribers with re-sharing of the RO-fd. The same path serves + * replay to a late subscriber. */ + +/* Build a MEMCTX delivery event for endpoint ep. segs are borrowed from the adapter's + * buffer (delivery is synchronous on the loop thread; ownership is not transferred). + * 1 — built. 
*/ +static int core_memctx_build(vmsig_core* c, uint32_t ep, vmsig_event* ev) { + if (ep >= 64) return 0; /* 0 — not built */ + core_memctx_cell* cell = &c->memctx[ep]; + if (!cell->registered || !cell->reg.describe) return 0; + + vmsig_memctx pod; + memset(&pod, 0, sizeof pod); + const vmsig_memseg* segs = NULL; + uint32_t nseg = 0; + cell->reg.describe(cell->reg.ctx, &pod, &segs, &nseg); + pod.epoch = c->epoch[ep]; /* core stamps the epoch */ + pod.nseg = nseg; + pod.flags |= VMSIG_MEMCTX_RDONLY; /* outward — always read-only */ + + memset(ev, 0, sizeof *ev); + ev->kind = VMSIG_EV_MEMCTX; ev->source = VMSIG_SRC_MEMCTX; ev->dir = VMSIG_DIR_UP; + ev->prio = VMSIG_PRIO_NORMAL; ev->endpoint = ep; + memcpy(ev->inln, &pod, sizeof pod); + ev->payload.data = (void*)segs; /* borrowed: owner is the adapter */ + ev->payload.len = (size_t)nseg * sizeof(vmsig_memseg); + ev->payload.codec = VMSIG_CODEC_MEMCTX; + ev->payload.flags = VMSIG_PL_BORROWED; + ev->payload.release = NULL; /* no release hook: the data is borrowed */ + return 1; +} + +/* Deliver MEMCTX to one qualified control: fresh RO-fd from reg.share_fd + * (socket -> cmsg, in-proc -> direct int), attach_memctx, close fd (the core does not own + * the fd). On success — audit MEMCTX_GRANTED. */ +static void core_memctx_deliver_one(vmsig_core* c, core_memctx_cell* cell, + core_control_ent* e, const vmsig_event* ev) { + if (!e->ops->attach_memctx) return; /* control does not accept MEMCTX */ + int fd = cell->reg.share_fd ? 
cell->reg.share_fd(cell->reg.ctx) : -1; + int r = e->ops->attach_memctx(e->ctl, ev, fd); /* fd is -1 when no share_fd hook is registered */ + if (fd >= 0) close(fd); /* the core does not own the ro-fd */ + if (r == 0) { + vmsig_audit a = { VMSIG_AUDIT_MEMCTX_GRANTED, e->grant.principal, + ev->endpoint, 0, 0 }; + core_audit(c, &a); + } +} + +void core_memctx_route(vmsig_core* c, const vmsig_event* trigger) { + uint32_t ep = trigger->endpoint; /* only the endpoint of the trigger is used */ + if (ep >= 64) return; + core_memctx_cell* cell = &c->memctx[ep]; + if (!cell->registered) return; + + vmsig_event ev; + if (!core_memctx_build(c, ep, &ev)) return; + + cell->valid = 1; /* epoch context published */ + cell->epoch = c->epoch[ep]; + + for (int i = 0; i < c->ncontrols; i++) { + core_control_ent* e = &c->controls[i]; + if (!e->active) continue; + if (grant_allows_up(&e->grant, &ev) && sub_match(&e->sub, &ev)) + core_memctx_deliver_one(c, cell, e, &ev); + } +} + +void core_memctx_replay(vmsig_core* c, int ctl_id) { + if (ctl_id < 0 || ctl_id >= c->ncontrols) return; + core_control_ent* e = &c->controls[ctl_id]; + if (!e->active) return; + for (uint32_t ep = 0; ep < 64; ep++) { + core_memctx_cell* cell = &c->memctx[ep]; + if (!cell->registered || !cell->valid) continue; /* nothing published for this endpoint in the current epoch */ + vmsig_event ev; + if (!core_memctx_build(c, ep, &ev)) continue; + if (grant_allows_up(&e->grant, &ev) && sub_match(&e->sub, &ev)) + core_memctx_deliver_one(c, cell, e, &ev); + } +} + +void core_epoch_bump(vmsig_core* c, uint32_t endpoint) { + if (endpoint >= 64) return; + c->epoch[endpoint]++; + core_memctx_cell* cell = &c->memctx[endpoint]; + cell->valid = 0; /* prior-epoch context is not replayed */ + + vmsig_event up; + memset(&up, 0, sizeof up); + up.kind = VMSIG_EV_MEMCTX_INVALIDATED; up.source = VMSIG_SRC_MEMCTX; + up.dir = VMSIG_DIR_UP; up.prio = VMSIG_PRIO_URGENT; up.endpoint = endpoint; + vmsig_memctx_inv inv = { endpoint, c->epoch[endpoint] }; + memcpy(up.inln, &inv, sizeof inv); + core_emit_up(c, &up); /* broadcast to holders (CAP_MEMCTX gate) */ + + /* request re-bootstrap from the 
adapter: it is handed the already-incremented epoch and re-emits MEMCTX when ready. */ + if (cell->registered && cell->reg.invalidate) + cell->reg.invalidate(cell->reg.ctx, c->epoch[endpoint]); +} + +/* UP: drain the context queue and dispatch to subscribed controls */ +static void pump_up(vmsig_core* c) { + vmsig_event ev; + while (vmsig_ctx_next(c->ctx, VMSIG_DIR_UP, &ev) == 1) { + if (ev.kind == VMSIG_EV_MEMCTX) { + /* Context trigger: the core builds the authoritative locator (adapter snapshot + * + epoch stamp) and distributes to qualified holders with re-sharing of the + * RO-fd. The trigger itself is NOT delivered as an ordinary event. */ + core_memctx_route(c, &ev); + vmsig_payload_release(&ev); /* inline trigger (release=NULL) — harmless */ + continue; + } + if (ev.kind == VMSIG_EV_VM_LIFECYCLE && ev.origin == 0) { + /* Epoch-transition observation: a destructive async transition (VMHOST + * broadcast) invalidates the address-space context. NOT continue — VM_LIFECYCLE + * still goes to subscribers below via the normal broadcast. */ + const vmsig_vm_state* vs = (const vmsig_vm_state*)ev.inln; /* NOTE(review): reads inln through a pointer cast; a memcpy into a local would avoid alignment/aliasing concerns — confirm inln's declared type and size */ + if (vs->state == VMSIG_VM_RESET || vs->state == VMSIG_VM_POWERDOWN || + vs->state == VMSIG_VM_SHUTDOWN) + core_epoch_bump(c, ev.endpoint); + } + if (ev.origin) { + /* addressed reply ONLY to the initiator (origin+generation). The command was + * already authorized by the grant => we deliver the reply without re-check; if + * the initiator is gone/slot reused — we drop (private data, not broadcast). 
*/ + core_control_ent* e = origin_ctl(c, ev.origin); + if (e && e->ops->deliver) e->ops->deliver(e->ctl, &ev); + } else { + /* unaddressed event — broadcast; effective = grant ∩ sub */ + for (int i = 0; i < c->ncontrols; i++) { + core_control_ent* e = &c->controls[i]; + if (!e->active) continue; + if (grant_allows_up(&e->grant, &ev) && sub_match(&e->sub, &ev) && e->ops->deliver) + e->ops->deliver(e->ctl, &ev); + } + } + vmsig_payload_release(&ev); + } +} + +/* DOWN: drain the queue and route the command to the adapter (source+endpoint) */ +static void pump_down(vmsig_core* c) { + vmsig_event ev; + while (vmsig_ctx_next(c->ctx, VMSIG_DIR_DOWN, &ev) == 1) { + core_control_ent* oe = origin_ctl(c, ev.origin); /* command has left ctx */ + if (oe && oe->pending) oe->pending--; /* THE ONLY decrement for a dequeued command */ + + /* In-flight fencing: destruction whose origin is NO LONGER the class owner (lease + * lost between the emit_down gate and dequeue) is dropped BEFORE actuation. + * Nothing is finalized here: the cell was already handed over by acquire/reap. + * pending is NOT touched here (already decremented above). */ + int cls = lease_class_for_down(&ev); + if (cls >= 0 && ev.endpoint < 64 && c->lease[ev.endpoint][cls].owner != ev.origin) { + /* dropping destruction that lost the lease is observable (origin owner's principal). */ + vmsig_audit a = { VMSIG_AUDIT_LEASE_DENIED, lease_owner_principal(c, ev.origin), + ev.endpoint, (uint32_t)ev.kind, (uint32_t)cls }; + core_audit(c, &a); + vmsig_payload_release(&ev); + continue; + } + + core_adapter_ent* e = core_find_adapter(c, ev.endpoint, ev.source); + if (e && e->ops->submit) e->ops->submit(e->a, &ev); /* no matching adapter => the command is silently dropped */ + vmsig_payload_release(&ev); + } +} + +/* Deferred reap of detached controls: after the batch (safe — not inside their own + * on_readable). epoll DEL + mark slot dead + ops->close. 
*/ +static void core_reap(vmsig_core* c) { + for (int i = 0; i < c->ncontrols; i++) { + core_control_ent* e = &c->controls[i]; + if (!e->reap || !e->active) continue; + if (e->slot) { + epoll_ctl(c->epfd, EPOLL_CTL_DEL, e->slot->fd, NULL); + e->slot->role = SLOT_DEAD; + } + core_lease_reap_control(c, i); /* return leases BEFORE active=0 (the reclaim keys on the live slot's gen; held keys are not finalized) */ + if (e->ops->close) e->ops->close(e->ctl); + e->active = 0; + e->reap = 0; + } +} + +int vmsig_core_run(vmsig_core* c) { + if (!c) return -1; + struct epoll_event evs[VMSIG_MAX_EVENTS]; + while (!__atomic_load_n(&c->stopping, __ATOMIC_ACQUIRE)) { + int n = epoll_wait(c->epfd, evs, VMSIG_MAX_EVENTS, -1); /* timeout -1: block until an fd is ready */ + if (n < 0) { if (errno == EINTR) continue; return -1; } + for (int i = 0; i < n; i++) { + core_slot* s = (core_slot*)evs[i].data.ptr; + switch (s->role) { + case SLOT_WAKEUP: + drain_counter_fd(s->fd); /* stopping is checked in while */ + break; + case SLOT_ADAPTER: + if (s->ops->on_readiness) + s->ops->on_readiness(s->adapter, s->cookie, evs[i].events); + break; + case SLOT_CTX_TIMING: + drain_counter_fd(s->fd); /* the expiry only wakes the loop; the pumps below re-poll ctx */ + break; + case SLOT_CONTROL: + if (s->cops->on_readable) + s->cops->on_readable(s->ctl); + break; + case SLOT_SOURCE: + if (s->on_source) + s->on_source(s->source_user, evs[i].events); + break; + case SLOT_DEAD: + break; /* detached — ignore */ + } + } + pump_up(c); + pump_down(c); + core_reap(c); + } + return 0; +} + +void vmsig_core_stop(vmsig_core* c) { + if (!c) return; + __atomic_store_n(&c->stopping, 1, __ATOMIC_RELEASE); /* cross-thread stop signal */ + core_wake(c); +} diff --git a/src/ctx/ctx.c b/src/ctx/ctx.c new file mode 100644 index 0000000..ebe6899 --- /dev/null +++ b/src/ctx/ctx.c @@ -0,0 +1,203 @@ +/* ctx.c — transfer context: priority, ordering, protocol timing. + * This is the SISC-critical seam. No behavioral timing here: commands arrive + * already decided by control; the context only orders and paces them. 
*/ +#include "ctx_internal.h" +#include +#include +#include +#include +#include + +/* Default depth ceiling for a single priority band (per dir) when the emitting source has no max_inflight policy. */ +#define VMSIG_CTX_DEFAULT_INFLIGHT 4096 + +static uint64_t now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec; +} + +/* ---- node recycling (free-list under the shared mutex) ------------------- */ +static ev_node* node_get(vmsig_ctx* c) { + ev_node* n = c->freelist; + if (n) { c->freelist = n->next; return n; } + return malloc(sizeof *n); +} +static void node_put(vmsig_ctx* c, ev_node* n) { + n->next = c->freelist; + c->freelist = n; +} + +vmsig_ctx* vmsig_ctx_new(void) { + vmsig_ctx* c = calloc(1, sizeof *c); + if (!c) return NULL; + if (pthread_mutex_init(&c->lock, NULL) != 0) { free(c); return NULL; } + for (int d = 0; d < 2; d++) { + c->dir[d].timing_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); + if (c->dir[d].timing_fd < 0) { + for (int k = 0; k < d; k++) close(c->dir[k].timing_fd); + pthread_mutex_destroy(&c->lock); + free(c); + return NULL; + } + } + return c; +} + +void vmsig_ctx_free(vmsig_ctx* c) { + if (!c) return; + for (int d = 0; d < 2; d++) { + for (int p = 0; p < VMSIG_PRIO_MAX; p++) { + ev_node* n = c->dir[d].band[p].head; + while (n) { ev_node* nx = n->next; vmsig_payload_release(&n->ev); free(n); n = nx; } + } + if (c->dir[d].timing_fd >= 0) close(c->dir[d].timing_fd); + } + /* actually free the recycled nodes (no payload attached) */ + ev_node* f = c->freelist; + while (f) { ev_node* nx = f->next; free(f); f = nx; } + pthread_mutex_destroy(&c->lock); + free(c); +} + +int vmsig_ctx_set_policy(vmsig_ctx* c, vmsig_source src, vmsig_dir dir, + vmsig_prio default_prio, const vmsig_timing* t) { + if (!c || src >= VMSIG_SRC_MAX || dir > VMSIG_DIR_DOWN) return -1; + pthread_mutex_lock(&c->lock); + ctx_policy* pol = &c->policy[src][dir]; + pol->default_prio = default_prio; + 
if (t) pol->timing = *t; else memset(&pol->timing, 0, sizeof pol->timing); + pol->policy_set = 1; + pthread_mutex_unlock(&c->lock); + return 0; +} + +static void band_push_tail(ev_band* b, ev_node* n) { + n->next = NULL; + if (b->tail) b->tail->next = n; else b->head = n; + b->tail = n; + b->count++; +} + +int vmsig_ctx_submit(vmsig_ctx* c, vmsig_dir dir, vmsig_event* ev) { + if (!c || !ev || dir > VMSIG_DIR_DOWN) return -1; + vmsig_source src = ev->source < VMSIG_SRC_MAX ? ev->source : VMSIG_SRC_NONE; + + pthread_mutex_lock(&c->lock); + ctx_policy* pol = &c->policy[src][dir]; + + /* effective priority = max(policy default, emitter request) */ + vmsig_prio eff = ev->prio > pol->default_prio ? ev->prio : pol->default_prio; + if (eff >= VMSIG_PRIO_MAX) eff = VMSIG_PRIO_MAX - 1; + + ev->seq = ++c->seq; + if (ev->ts_ns == 0) ev->ts_ns = now_ns(); + ev->prio = eff; + + ev_band* band = &c->dir[dir].band[eff]; + + /* coalescing: a burst of the same kind+endpoint from the same source and origin is collapsed (newest wins, returns 1); another initiator's queued event is never overwritten */ + if (pol->timing.coalesce_ns) { + for (ev_node* n = band->head; n; n = n->next) { + if (n->ev.kind == ev->kind && n->ev.endpoint == ev->endpoint && n->ev.source == ev->source && n->ev.origin == ev->origin) { + vmsig_payload_release(&n->ev); + uint32_t keep_seq = n->ev.seq; /* keep position in the order */ + n->ev = *ev; + n->ev.seq = keep_seq; + pthread_mutex_unlock(&c->lock); + return 1; + } + } + } + + /* backpressure: channel depth is bounded. When no policy is set + * (max_inflight==0), a BUILT-IN default ceiling applies (drop newest), + * so the queue does not grow without bound under a command flood. */ + uint32_t cap = pol->timing.max_inflight ? pol->timing.max_inflight + : VMSIG_CTX_DEFAULT_INFLIGHT; + uint8_t dp = pol->timing.max_inflight ? 
pol->timing.drop_policy + : VMSIG_DROP_NEWEST; + if ((uint32_t)band->count >= cap) { /* unsigned compare: a cap above INT_MAX must not turn negative and drop everything */ + if (dp == VMSIG_DROP_OLDEST) { + ev_node* old = band->head; /* drop the oldest */ + if (old) { + band->head = old->next; + if (!band->head) band->tail = NULL; + band->count--; + vmsig_payload_release(&old->ev); + node_put(c, old); + } + } else { + /* NEWEST / BLOCK (the loop must not block) — drop the incoming event */ + vmsig_payload_release(ev); + pthread_mutex_unlock(&c->lock); + return 1; + } + } + + ev_node* node = node_get(c); + if (!node) { pthread_mutex_unlock(&c->lock); return -1; } /* allocation failed: the event is not queued and its payload is not released here */ + node->ev = *ev; /* take ownership of the payload */ + band_push_tail(band, node); + pthread_mutex_unlock(&c->lock); + return 0; +} + +int vmsig_ctx_next(vmsig_ctx* c, vmsig_dir dir, vmsig_event* out) { + if (!c || !out || dir > VMSIG_DIR_DOWN) return -1; + pthread_mutex_lock(&c->lock); + ctx_dir* d = &c->dir[dir]; + uint64_t now = now_ns(); + uint64_t min_rem = 0; + int have_rem = 0; + + /* Walk bands from highest priority to lowest, and within a band from head + * to tail, returning the FIRST event "matured" against its protocol min_gap. + * A paced source thus waits without blocking ready events of other sources. + * Within one source the order is preserved (its earlier events come first). */ + for (int p = VMSIG_PRIO_MAX - 1; p >= 0; p--) { + ev_band* b = &d->band[p]; + ev_node* prev = NULL; + ev_node* n = b->head; + while (n) { + vmsig_source src = n->ev.source < VMSIG_SRC_MAX ? 
n->ev.source : VMSIG_SRC_NONE; + ctx_policy* pol = &c->policy[src][dir]; + int due = 1; + uint64_t rem = 0; + if (pol->timing.min_gap_ns) { + uint64_t due_at = pol->last_emit_ns + pol->timing.min_gap_ns; + if (now < due_at) { due = 0; rem = due_at - now; } + } + if (due) { + if (prev) prev->next = n->next; else b->head = n->next; + if (b->tail == n) b->tail = prev; + b->count--; + pol->last_emit_ns = now; + *out = n->ev; /* payload ownership -> caller */ + node_put(c, n); + pthread_mutex_unlock(&c->lock); + return 1; + } + if (!have_rem || rem < min_rem) { min_rem = rem; have_rem = 1; } + prev = n; + n = n->next; + } + } + + /* nothing matured: arm the timing-fd for the nearest due time (if any waiting) */ + if (have_rem) { + struct itimerspec its; + memset(&its, 0, sizeof its); + its.it_value.tv_sec = (time_t)(min_rem / 1000000000ull); + its.it_value.tv_nsec = (long)(min_rem % 1000000000ull); + if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0) its.it_value.tv_nsec = 1; + timerfd_settime(d->timing_fd, 0, &its, NULL); /* one-shot, relative (flags 0, zero interval); a failure is ignored */ + } + pthread_mutex_unlock(&c->lock); + return 0; +} + +int vmsig_ctx_timing_fd(vmsig_ctx* c, vmsig_dir dir) { + if (!c || dir > VMSIG_DIR_DOWN) return -1; + return c->dir[dir].timing_fd; +} diff --git a/src/ctx/include/ctx_internal.h b/src/ctx/include/ctx_internal.h new file mode 100644 index 0000000..c387c29 --- /dev/null +++ b/src/ctx/include/ctx_internal.h @@ -0,0 +1,41 @@ +#ifndef VMSIG_CTX_INTERNAL_H +#define VMSIG_CTX_INTERNAL_H +#include "vmsig_ctx.h" +#include +#include + +/* Private internals of the transfer context. Priority bands are simple + * linked FIFOs (one node per event; recycling is a later optimization). 
/* One queued event plus the link to the next node in the same band.
 * Nodes are obtained with node_get() and returned with node_put(). */
typedef struct ev_node {
    vmsig_event ev;
    struct ev_node* next;
} ev_node;

/* FIFO of one priority band: new nodes are pushed at `tail`, delivery scans
 * from `head`. `count` is compared against the policy's capacity on submit
 * and decremented whenever a node is unlinked. */
typedef struct {
    ev_node* head;
    ev_node* tail;
    int count;
} ev_band;

/* Per (source, direction) delivery policy and its pacing state. */
typedef struct {
    vmsig_prio default_prio;
    vmsig_timing timing;       /* min_gap_ns, drop_policy, ... as set by the caller */
    uint64_t last_emit_ns;     /* time of the last delivery; checked against timing.min_gap_ns */
    int policy_set;            /* NOTE(review): presumably non-zero once a policy was installed
                                * for this slot -- confirm against vmsig_ctx_set_policy */
} ctx_policy;

/* Queue state of one transfer direction. */
typedef struct {
    ev_band band[VMSIG_PRIO_MAX]; /* 4 priority bands; scanned from the highest index down */
    int timing_fd;                /* pacing timerfd (created in ctx_new), armed when only paced
                                   * events remain */
} ctx_dir;

/* The transfer context. `lock` is held around queue access in
 * vmsig_ctx_submit() and vmsig_ctx_next(). */
struct vmsig_ctx {
    pthread_mutex_t lock;
    uint32_t seq;                         /* NOTE(review): not used in the visible code;
                                           * presumably an event sequence counter -- confirm */
    ev_node* freelist;                    /* ev_node recycling (node_get/node_put) */
    ctx_dir dir[2];                       /* indexed by VMSIG_DIR_UP/DOWN */
    ctx_policy policy[VMSIG_SRC_MAX][2];  /* [source][dir] */
};
*/ +#define _GNU_SOURCE +#include "vmsig.h" +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { \ + if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \ +} while (0) + +#define EP 0u + +typedef struct { + vmsig_core* core; + const char* tag; + int memctx; /* how many MEMCTX this control received */ + int ticks; /* watchdog: VM_LIFECYCLE (only on GOOD) */ + int is_good; /* GOOD stops the loop on the first MEMCTX */ +} holder; + +static int on_ev(void* user, const vmsig_event* ev) { + holder* h = user; + if (ev->kind == VMSIG_EV_VM_LIFECYCLE) { + h->ticks++; + if (h->is_good && h->ticks > 20) vmsig_core_stop(h->core); /* failsafe */ + } + return 0; +} + +static int on_memctx(void* user, const vmsig_event* ev, int fd) { + holder* h = user; + (void)ev; (void)fd; + h->memctx++; + if (h->is_good) vmsig_core_stop(h->core); + return 0; +} + +static void add_holder(vmsig_core* core, holder* h, uint32_t cap, + uint32_t source_mask, uint64_t endpoint_mask) { + vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg); + cfg.on_event = on_ev; cfg.on_memctx = on_memctx; cfg.user = h; + void* ctl = vmsig_inproc_control_new(&cfg); + vmsig_grant g; memset(&g, 0, sizeof g); + g.principal = 7; g.endpoint_mask = endpoint_mask; + g.source_mask = source_mask; g.cap_mask = cap; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); +} + +int main(void) { + printf("test_authz (memctx least-privilege)\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + holder good = { core, "GOOD", 0, 0, 1 }; + holder nocap = { core, "NOCAP", 0, 0, 0 }; + holder nosrc = { core, "NOSRC", 0, 0, 0 }; + holder noep = { core, "NOEP", 0, 0, 0 }; + + /* GOOD: CAP_MEMCTX (+OBSERVE for watchdog lifecycle ticks), source MEMCTX, ep0 -> receives. */ + add_holder(core, &good, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << EP); + /* NOCAP: OBSERVE only (no CAP_MEMCTX) -> deny by cap. 
*/ + add_holder(core, &nocap, VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << EP); + /* NOSRC: CAP_MEMCTX, but source_mask without the MEMCTX bit -> deny by source_mask. */ + add_holder(core, &nosrc, VMSIG_CAP_MEMCTX, ~(1u << VMSIG_SRC_MEMCTX), 1ull << EP); + /* NOEP: CAP_MEMCTX, source ok, but endpoint outside the mask (ep1) -> deny by endpoint. */ + add_holder(core, &noep, VMSIG_CAP_MEMCTX, 0xFFFFFFFFu, 1ull << 1); + + /* watchdog lifecycle ticks + address-space context on one endpoint (stub). */ + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, EP) >= 0, "add vmhost adapter"); + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, EP) >= 0, "add memctx adapter"); + + int rc = vmsig_core_run(core); + printf(" rc=%d GOOD.memctx=%d NOCAP=%d NOSRC=%d NOEP=%d\n", + rc, good.memctx, nocap.memctx, nosrc.memctx, noep.memctx); + + CHECK(good.memctx >= 1, "GOOD receives MEMCTX (cap+source+endpoint)"); + CHECK(nocap.memctx == 0, "NOCAP does not receive (no CAP_MEMCTX)"); + CHECK(nosrc.memctx == 0, "NOSRC does not receive (source_mask without MEMCTX)"); + CHECK(noep.memctx == 0, "NOEP does not receive (endpoint outside mask)"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); + printf("authz tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_ctx.c b/src/test/test_ctx.c new file mode 100644 index 0000000..a81c9a9 --- /dev/null +++ b/src/test/test_ctx.c @@ -0,0 +1,125 @@ +/* test_ctx.c — unit tests for the transfer context (public vmsig_ctx_* API): + * priority->seq, coalescing, backpressure (drop oldest/newest), protocol + * pacing via timing-fd. Links against libvmsig; run through ctest. */ +#include "vmsig.h" +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { \ + if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \ +} while (0) + +/* Submit a DOWN event with the given prio/kind/endpoint/corr. 
*/ +static int put(vmsig_ctx* c, vmsig_prio p, vmsig_kind k, uint32_t ep, uint32_t corr) { + vmsig_event e; + memset(&e, 0, sizeof e); + e.source = VMSIG_SRC_NONE; e.dir = VMSIG_DIR_DOWN; + e.prio = p; e.kind = k; e.endpoint = ep; e.corr = corr; + return vmsig_ctx_submit(c, VMSIG_DIR_DOWN, &e); +} + +/* ---- 1. priority first, then FIFO by seq within a band ------------------- */ +static void test_priority_seq(void) { + printf("test_priority_seq\n"); + vmsig_ctx* c = vmsig_ctx_new(); + put(c, VMSIG_PRIO_BULK, VMSIG_EV_CMD_INPUT, 0, 0xA); + put(c, VMSIG_PRIO_URGENT, VMSIG_EV_CMD_INPUT, 0, 0xB); + put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_INPUT, 0, 0xC); + put(c, VMSIG_PRIO_BULK, VMSIG_EV_CMD_INPUT, 0, 0xD); + put(c, VMSIG_PRIO_URGENT, VMSIG_EV_CMD_INPUT, 0, 0xE); + uint32_t want[5] = { 0xB, 0xE, 0xC, 0xA, 0xD }; /* URGENT(seq) -> NORMAL -> BULK(seq) */ + vmsig_event o; + for (int i = 0; i < 5; i++) { + int r = vmsig_ctx_next(c, VMSIG_DIR_DOWN, &o); + CHECK(r == 1, "next must return an event"); + CHECK(o.corr == want[i], "priority->seq order"); + } + CHECK(vmsig_ctx_next(c, VMSIG_DIR_DOWN, &o) == 0, "queue drained"); + vmsig_ctx_free(c); +} + +/* ---- 2. coalescing: a burst of one kind+endpoint collapses (newest wins) - */ +static void test_coalesce(void) { + printf("test_coalesce\n"); + vmsig_ctx* c = vmsig_ctx_new(); + vmsig_timing t; memset(&t, 0, sizeof t); t.coalesce_ns = 1; + vmsig_ctx_set_policy(c, VMSIG_SRC_NONE, VMSIG_DIR_DOWN, VMSIG_PRIO_NORMAL, &t); + put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_VM, 0, 0x11); + int r2 = put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_VM, 0, 0x22); /* should coalesce */ + CHECK(r2 == 1, "second submit coalesced (=1)"); + vmsig_event o; + CHECK(vmsig_ctx_next(c, VMSIG_DIR_DOWN, &o) == 1, "one event after coalescing"); + CHECK(o.corr == 0x22, "newest data after coalescing"); + CHECK(vmsig_ctx_next(c, VMSIG_DIR_DOWN, &o) == 0, "nothing more"); + vmsig_ctx_free(c); +} + +/* ---- 3. 
backpressure drop_oldest --------------------------------------- */ +static void test_backpressure_oldest(void) { + printf("test_backpressure_oldest\n"); + vmsig_ctx* c = vmsig_ctx_new(); + vmsig_timing t; memset(&t, 0, sizeof t); + t.max_inflight = 2; t.drop_policy = VMSIG_DROP_OLDEST; + vmsig_ctx_set_policy(c, VMSIG_SRC_NONE, VMSIG_DIR_DOWN, VMSIG_PRIO_NORMAL, &t); + put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_INPUT, 0, 1); + put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_INPUT, 0, 2); + put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_INPUT, 0, 3); /* evicts 1 */ + vmsig_event o; + int got[8], n = 0; + while (vmsig_ctx_next(c, VMSIG_DIR_DOWN, &o) == 1) got[n++] = (int)o.corr; + CHECK(n == 2, "2 remain after drop_oldest"); + CHECK(n == 2 && got[0] == 2 && got[1] == 3, "oldest evicted (1)"); + vmsig_ctx_free(c); +} + +/* ---- 4. backpressure drop_newest --------------------------------------- */ +static void test_backpressure_newest(void) { + printf("test_backpressure_newest\n"); + vmsig_ctx* c = vmsig_ctx_new(); + vmsig_timing t; memset(&t, 0, sizeof t); + t.max_inflight = 2; t.drop_policy = VMSIG_DROP_NEWEST; + vmsig_ctx_set_policy(c, VMSIG_SRC_NONE, VMSIG_DIR_DOWN, VMSIG_PRIO_NORMAL, &t); + put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_INPUT, 0, 1); + put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_INPUT, 0, 2); + int r3 = put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_INPUT, 0, 3); /* dropped */ + CHECK(r3 == 1, "third submit dropped (=1)"); + vmsig_event o; + int got[8], n = 0; + while (vmsig_ctx_next(c, VMSIG_DIR_DOWN, &o) == 1) got[n++] = (int)o.corr; + CHECK(n == 2 && got[0] == 1 && got[1] == 2, "newest dropped (3)"); + vmsig_ctx_free(c); +} + +/* ---- 5. 
protocol pacing via timing-fd ---------------------------------- */ +static void test_pacing(void) { + printf("test_pacing\n"); + vmsig_ctx* c = vmsig_ctx_new(); + vmsig_timing t; memset(&t, 0, sizeof t); + t.min_gap_ns = 20u * 1000000u; /* 20 ms gap */ + vmsig_ctx_set_policy(c, VMSIG_SRC_NONE, VMSIG_DIR_DOWN, VMSIG_PRIO_NORMAL, &t); + put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_INPUT, 0, 1); + put(c, VMSIG_PRIO_NORMAL, VMSIG_EV_CMD_INPUT, 0, 2); + vmsig_event o; + CHECK(vmsig_ctx_next(c, VMSIG_DIR_DOWN, &o) == 1, "first delivered immediately"); + CHECK(vmsig_ctx_next(c, VMSIG_DIR_DOWN, &o) == 0, "second paced (0 for now)"); + int tfd = vmsig_ctx_timing_fd(c, VMSIG_DIR_DOWN); + CHECK(tfd >= 0, "timing-fd valid"); + struct pollfd pfd = { .fd = tfd, .events = POLLIN }; + int pr = poll(&pfd, 1, 1000); /* wait for it to fire, no longer than 1s */ + CHECK(pr == 1, "timing-fd became ready within the gap"); + CHECK(vmsig_ctx_next(c, VMSIG_DIR_DOWN, &o) == 1, "after the gap the second matured"); + CHECK(o.corr == 2, "this is exactly the second event"); + vmsig_ctx_free(c); +} + +int main(void) { + test_priority_seq(); + test_coalesce(); + test_backpressure_oldest(); + test_backpressure_newest(); + test_pacing(); + printf("ctx tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_inputobs.c b/src/test/test_inputobs.c new file mode 100644 index 0000000..830e0ec --- /dev/null +++ b/src/test/test_inputobs.c @@ -0,0 +1,93 @@ +/* test_inputobs.c — input observation: + * held-query: a control with CAP_INPUT, on CMD_QUERY_INPUT, receives INPUT_HELD from the + * vmctl record (stub without vmctl => count=0); without CAP_INPUT — DOWN_DENIED. + * (The cursor sensor moved out of signaling with the FRAME adapter: CURSOR_STATE is now + * emitted by the out-of-repo vgpu-perception shell-as-control, not by a signaling adapter.) + * In-proc, under ASAN. Links against libvmsig. 
*/ +#define _GNU_SOURCE +#include "vmsig.h" +#include +#include +#include +#include + +static int g_fail = 0; +#define CHECK(c, m) do { if (!(c)) { printf(" FAIL: %s\n", (m)); g_fail = 1; } } while (0) + +#define EP 0u + +typedef struct { + vmsig_core* core; + void* ctl; + int held; /* INPUT_HELD count */ + int last_held_count; + int stop_held; /* stop after N held replies (0=no) */ +} obs; + +static int on_ev(void* u, const vmsig_event* ev) { + obs* o = u; + if (ev->kind == VMSIG_EV_INPUT_HELD) { + const vmsig_input_held* h = (const vmsig_input_held*)ev->inln; + o->held++; o->last_held_count = (int)h->count; + if (o->stop_held && o->held >= o->stop_held) vmsig_core_stop(o->core); + } + return 0; +} + +static void add_ctl(vmsig_core* core, obs* o, uint32_t cap, uint32_t src_mask) { + vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg); + cfg.on_event = on_ev; cfg.user = o; + cfg.sub.source_mask = src_mask; cfg.sub.prio_min = VMSIG_PRIO_BULK; + void* ctl = vmsig_inproc_control_new(&cfg); + o->ctl = ctl; + vmsig_grant g; memset(&g, 0, sizeof g); + g.endpoint_mask = 1ull << EP; g.source_mask = src_mask; g.cap_mask = cap; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); +} + +static void send_query_input(void* ctl) { + vmsig_event d; memset(&d, 0, sizeof d); + d.kind = VMSIG_EV_CMD_QUERY_INPUT; d.source = VMSIG_SRC_INPUT; d.dir = VMSIG_DIR_DOWN; + d.endpoint = EP; d.prio = VMSIG_PRIO_HIGH; + vmsig_inproc_send(ctl, &d); +} + +static int g_down_denied = 0; +static void audit_cb(void* ud, const vmsig_audit* a) { + (void)ud; if (a->kind == VMSIG_AUDIT_DOWN_DENIED) g_down_denied++; +} + +/* ---- held-query: CAP_INPUT -> INPUT_HELD (stub count=0); without cap -> DOWN_DENIED ---- */ +static void test_held_query(void) { + printf("test_held_query\n"); + g_down_denied = 0; + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + vmsig_core_set_audit(core, audit_cb, NULL); + vmsig_core_add_adapter(core, vmsig_input_ops(), NULL, EP); /* 
stub input (no vmctl) */ + + obs a; memset(&a, 0, sizeof a); a.core = core; a.stop_held = 1; + add_ctl(core, &a, VMSIG_CAP_INPUT, 0xFFFFFFFFu); + send_query_input(a.ctl); + vmsig_core_run(core); /* pump_down -> INPUT_HELD -> pump_up */ + CHECK(a.held == 1, "held: CAP_INPUT receives INPUT_HELD"); + CHECK(a.last_held_count == 0, "held: stub without vmctl -> count=0"); + + /* without CAP_INPUT (OBSERVE only): CMD_QUERY_INPUT rejected BEFORE ctx (synchronously). */ + obs b; memset(&b, 0, sizeof b); b.core = core; + add_ctl(core, &b, VMSIG_CAP_OBSERVE, 0xFFFFFFFFu); + int before = g_down_denied; + send_query_input(b.ctl); + CHECK(b.held == 0, "held: without CAP_INPUT -> no INPUT_HELD"); + CHECK(g_down_denied == before + 1, "held: without CAP_INPUT -> DOWN_DENIED"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +int main(void) { + printf("test_inputobs\n"); + test_held_query(); + printf("inputobs tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_lease.c b/src/test/test_lease.c new file mode 100644 index 0000000..e652db6 --- /dev/null +++ b/src/test/test_lease.c @@ -0,0 +1,525 @@ +/* test_lease.c — arbitration layer (exclusive lease of destructive resources). + * + * 13 cases from the contract (docs/plans/lease-arbitration.md §Tests). In-proc, + * runs without a live loop where synchronous interception suffices + * (ACQUIRE/RELEASE/STATUS and the lease gate are synchronous in core_emit_down); + * a live vmsig_core_run() — to check finalization/fencing/reclaim, where the + * input-adapter worker is involved (actuation on the worker thread, ACK via the loop). + * + * Observability of finalization ordering: the input-worker FIFO => ACT_ACK order == + * submit order. A synthesized key-up has origin=0 (broadcast), the owner's CMD_INPUT + * is an addressed ACK. A shared monotonic log records the relative ordering. 
/* Per-control tally of lease replies, updated by on_lease_ev(). */
typedef struct {
    int granted, denied, released, revoked, status; /* one counter per LEASE_* reply kind */
    int last_deny_reason;       /* `reason` of the most recent LEASE_DENIED */
    int last_status_busy;       /* `busy` of the most recent LEASE_STATUS */
    uint32_t last_status_owner; /* `owner_principal` of the most recent LEASE_STATUS */
} lease_counts;

/* State shared by all controls of one test case. The event callbacks write it
 * and the test body reads it after the loop has stopped. */
typedef struct {
    void* core;                 /* the vmsig_core under test (passed to vmsig_core_stop) */
    lease_counts cnt[4];        /* indexed by control */
    /* log of ACT_ACK (actuations) in arrival order: tag = corr (0 = synthesized up) */
    int ack_log[64];
    int nack;                   /* number of valid entries in ack_log (capped at 64) */
    int stop_after_acks;        /* stop the loop after N actuations (0=not auto) */
    int total_replies;          /* count of all lease UP events (GRANTED/DENIED/RELEASED/STATUS/REVOKED) */
    int stop_replies;           /* stop the loop when total_replies>=this (0=not auto) */
    /* phase orchestration for finalization/fencing (on the loop thread via on_ev) */
    void* ctl_a;
    void* ctl_b;
    int phase;                  /* count of scenario phases passed */
    int scenario;               /* 0=none, 1=preempt-finalize, 2=in-flight-fence */
} lease_state;

/* User-data handed to each in-proc control: the shared state plus the index
 * of that control's slot in lease_state.cnt[]. */
typedef struct { lease_state* s; int idx; } cref;
*/ +static cref* g_crefs[64]; +static int g_ncrefs = 0; +static cref* cref_new(lease_state* s, int idx) { + cref* r = calloc(1, sizeof *r); + r->s = s; r->idx = idx; + if (g_ncrefs < 64) g_crefs[g_ncrefs++] = r; + return r; +} +static void cref_free_all(void) { + for (int i = 0; i < g_ncrefs; i++) free(g_crefs[i]); + g_ncrefs = 0; +} + +/* forward declarations of send helpers (used in on_ev for phase orchestration) */ +static int acquire(void* ctl, uint32_t cls, uint32_t ep); +static int send_key(void* ctl, int code, int value, uint32_t corr, uint32_t ep); + +static void on_lease_ev(lease_state* s, int idx, const vmsig_event* ev) { + lease_counts* c = &s->cnt[idx]; + switch (ev->kind) { + case VMSIG_EV_LEASE_GRANTED: c->granted++; s->total_replies++; break; + case VMSIG_EV_LEASE_DENIED: { + c->denied++; + c->last_deny_reason = (int)((const vmsig_lease_req*)ev->inln)->reason; + s->total_replies++; + break; + } + case VMSIG_EV_LEASE_RELEASED: c->released++; s->total_replies++; break; + case VMSIG_EV_LEASE_REVOKED: c->revoked++; s->total_replies++; break; + case VMSIG_EV_LEASE_STATUS: { + c->status++; + const vmsig_lease_status* st = (const vmsig_lease_status*)ev->inln; + c->last_status_busy = (int)st->busy; + c->last_status_owner = st->owner_principal; + s->total_replies++; + break; + } + default: break; + } + if (s->stop_replies && s->total_replies >= s->stop_replies) + vmsig_core_stop(s->core); +} + +static int on_ev(void* user, const vmsig_event* ev) { + cref* r = user; lease_state* s = r->s; + on_lease_ev(s, r->idx, ev); + if (ev->kind == VMSIG_EV_ACT_ACK) { + if (s->nack < 64) s->ack_log[s->nack++] = (int)ev->corr; + + /* Phase orchestration (loop thread): wait for the REAL actuation of A's key-down + * (the held-set is filled in pump_down=in_submit), and only THEN let B preempt — + * otherwise finalization on a synchronous acquire would run over an empty held-set. 
*/ + if (s->scenario == 1 && ev->corr == 11 && s->phase == 0) { + s->phase = 1; + acquire(s->ctl_b, VMSIG_LEASE_INPUT, 0); /* B preempts AFTER A's actuation */ + send_key(s->ctl_b, 31, 1, 22, 0); + send_key(s->ctl_b, 31, 0, 23, 0); + } + + if (s->stop_after_acks && s->nack >= s->stop_after_acks) + vmsig_core_stop(s->core); + } + return 0; +} + +/* ---------- DOWN send helpers ---------- */ + +static int send_lease(void* ctl, vmsig_kind kind, uint32_t cls, uint32_t ep) { + vmsig_event d; + memset(&d, 0, sizeof d); + d.kind = kind; d.source = VMSIG_SRC_INPUT; d.dir = VMSIG_DIR_DOWN; + d.endpoint = ep; d.prio = VMSIG_PRIO_HIGH; + vmsig_lease_req lr = { cls, 0 }; + memcpy(d.inln, &lr, sizeof lr); + return vmsig_inproc_send(ctl, &d); +} +static int acquire(void* ctl, uint32_t cls, uint32_t ep) { + return send_lease(ctl, VMSIG_EV_CMD_ACQUIRE, cls, ep); +} +static int release_(void* ctl, uint32_t cls, uint32_t ep) { + return send_lease(ctl, VMSIG_EV_CMD_RELEASE, cls, ep); +} +static int status(void* ctl, uint32_t cls, uint32_t ep) { + return send_lease(ctl, VMSIG_EV_CMD_LEASE_STATUS, cls, ep); +} + +/* CMD_INPUT: KEY/BTN with value, corr for tracking. 
*/ +static int send_key(void* ctl, int code, int value, uint32_t corr, uint32_t ep) { + vmsig_event d; + memset(&d, 0, sizeof d); + d.kind = VMSIG_EV_CMD_INPUT; d.source = VMSIG_SRC_INPUT; d.dir = VMSIG_DIR_DOWN; + d.endpoint = ep; d.prio = VMSIG_PRIO_HIGH; d.corr = corr; + vmsig_input in; memset(&in, 0, sizeof in); + in.kind = VMSIG_INPUT_KEY; in.code = code; in.value = value; + memcpy(d.inln, &in, sizeof in); + return vmsig_inproc_send(ctl, &d); +} + +static int send_life(void* ctl, int op, uint32_t corr, uint32_t ep) { + vmsig_event d; + memset(&d, 0, sizeof d); + d.kind = VMSIG_EV_CMD_LIFECYCLE; d.source = VMSIG_SRC_INPUT; d.dir = VMSIG_DIR_DOWN; + d.endpoint = ep; d.prio = VMSIG_PRIO_URGENT; d.corr = corr; d.inln[0] = (uint8_t)op; + return vmsig_inproc_send(ctl, &d); +} + +/* Pump the loop until `n` more lease replies arrive (UP delivery via ctx + * requires pump_up). Lease DOWN intercepts are synchronous, but their UP replies are + * paced by the context => a live loop is needed. */ +static void pump_n(lease_state* s, int n) { + vmsig_core* c = (vmsig_core*)s->core; + c->stopping = 0; /* white-box: reuse the loop between phases */ + s->stop_replies = s->total_replies + n; + vmsig_core_run(c); + s->stop_replies = 0; +} + +static void* add_ctl(vmsig_core* core, lease_state* s, int idx, uint32_t cap, + uint32_t arb_prio, uint64_t epmask) { + cref* r = cref_new(s, idx); + vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg); + cfg.on_event = on_ev; cfg.user = r; + cfg.sub.source_mask = 0xFFFFFFFFu; cfg.sub.prio_min = VMSIG_PRIO_BULK; + cfg.sub.endpoint_mask = 0; /* all VMs */ + void* ctl = vmsig_inproc_control_new(&cfg); + vmsig_grant g; memset(&g, 0, sizeof g); + g.endpoint_mask = epmask; g.source_mask = 0xFFFFFFFFu; + g.cap_mask = cap | VMSIG_CAP_OBSERVE; /* OBSERVE => sees the broadcast finalization ACT_ACK */ + g.arb_prio = arb_prio; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); + return ctl; +} + +/* ===== Synchronous test group (no 
/* ===== Synchronous test group: ACQUIRE/RELEASE/STATUS interception =====
 *
 * One core, no adapters. Cases 1, 2, 3, 7, 8, 11, 6 and 12 of the contract run
 * back to back on the same core, so each case depends on the lease state left
 * by the previous ones (A keeps INPUT@ep0 from case 1 until case 6 releases it).
 *
 * Counter slots are shared between controls: C, P0 and SAFE are registered
 * with idx 1 (B's slot) and NC with idx 2 (Lo's slot). For that reason most
 * cases snapshot a counter first and assert on the delta, not on an absolute
 * value. */
static void test_sync_group(void) {
    vmsig_ctx* ctx = vmsig_ctx_new();
    vmsig_core* core = vmsig_core_new(ctx);
    lease_state s; memset(&s, 0, sizeof s); s.core = core;

    void* A = add_ctl(core, &s, 0, VMSIG_CAP_INPUT, 10, 1ull << 0);
    void* B = add_ctl(core, &s, 1, VMSIG_CAP_INPUT, 100, 1ull << 0);
    void* Lo= add_ctl(core, &s, 2, VMSIG_CAP_INPUT, 5, 1ull << 0); /* low priority */
    void* X = add_ctl(core, &s, 3, VMSIG_CAP_INPUT, 10, 1ull << 1); /* grant on ep1, not ep0 */

    /* Lease DOWN intercepts are synchronous, but the UP reply is delivered via ctx => after
     * each lease request we pump the loop until the corresponding UP arrives. */

    /* 1) acquire -> GRANTED; the owner's CMD_INPUT passes the gate (==0). */
    acquire(A, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1);
    CHECK(s.cnt[0].granted == 1, "1: A gets GRANTED");
    CHECK(send_key(A, 30, 1, 1, 0) == 0, "1: owner's CMD_INPUT passes the gate");
    send_key(A, 30, 0, 2, 0); /* release, so as not to leave held for the next tests */

    /* 2) gate: non-owner B -> CMD_INPUT dropped (-1). */
    CHECK(send_key(B, 30, 1, 3, 0) == -1, "2: non-owner: CMD_INPUT dropped by the gate");

    /* 3) equal priorities: a contender of the same prio -> DENIED{HELD}. */
    {
        /* C has the same arb_prio (10) as the current owner A; it reports into slot 1 */
        void* C = add_ctl(core, &s, 1, VMSIG_CAP_INPUT, 10, 1ull << 0); (void)C;
        int before = s.cnt[1].denied;
        acquire(C, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1);
        CHECK(s.cnt[1].denied == before + 1, "3: equal prio -> DENIED");
        CHECK(s.cnt[1].last_deny_reason == VMSIG_LEASE_DENY_HELD, "3: reason=HELD");
    }

    /* 7) without cap -> NOCAP; foreign endpoint -> NOGRANT. */
    {
        /* NC is granted OBSERVE only (add_ctl always adds it); it reports into slot 2 */
        void* NC = add_ctl(core, &s, 2, 0u /* without INPUT */, 10, 1ull << 0);
        int before = s.cnt[2].denied;
        acquire(NC, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1);
        CHECK(s.cnt[2].denied == before + 1, "7: without cap -> DENIED");
        CHECK(s.cnt[2].last_deny_reason == VMSIG_LEASE_DENY_NOCAP, "7: reason=NOCAP");
    }
    {
        int before = s.cnt[3].denied;
        acquire(X, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1); /* X has a grant on ep1, requests ep0 */
        CHECK(s.cnt[3].denied == before + 1, "7: foreign endpoint -> DENIED");
        CHECK(s.cnt[3].last_deny_reason == VMSIG_LEASE_DENY_NOGRANT, "7: reason=NOGRANT");
    }

    /* 8) per-endpoint / per-class independence. */
    {
        void* P0 = add_ctl(core, &s, 1, VMSIG_CAP_POWER, 50, 1ull << 0);
        int gb = s.cnt[1].granted, gx = s.cnt[3].granted;
        acquire(X, VMSIG_LEASE_INPUT, 1); pump_n(&s, 1); /* X on its own ep1 — free */
        CHECK(s.cnt[3].granted == gx + 1, "8: X owns INPUT@ep1 independently");
        acquire(P0, VMSIG_LEASE_POWER, 0); pump_n(&s, 1); /* POWER@ep0 is free, even though INPUT@ep0 is held by A */
        CHECK(s.cnt[1].granted == gb + 1, "8: POWER@ep0 independent of INPUT@ep0");
    }

    /* 11) STATUS: busy=1 while A holds INPUT@ep0. */
    {
        int before = s.cnt[1].status;
        status(B, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1);
        CHECK(s.cnt[1].status == before + 1, "11: STATUS replies");
        CHECK(s.cnt[1].last_status_busy == 1, "11: busy=1 while A owns");
    }

    /* 6) release -> reacquire: A releases, Lo (low prio) now gets it. */
    {
        int rb = s.cnt[0].released;
        release_(A, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1);
        CHECK(s.cnt[0].released == rb + 1, "6: A gets RELEASED");
        int gb = s.cnt[2].granted;
        acquire(Lo, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1); /* free -> even low prio takes it */
        CHECK(s.cnt[2].granted == gb + 1, "6: reacquire after release succeeds");
        status(B, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1);
        CHECK(s.cnt[1].last_status_busy == 1, "11: busy=1 after reacquire");
        /* each pump_n(&s, 1) waits for exactly one reply: RELEASED first, then STATUS */
        release_(Lo, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1);
        status(B, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1);
        CHECK(s.cnt[1].last_status_busy == 0, "11: busy=0 after releasing all");
    }

    /* 12) safe commands are NOT gated by the lease (nobody holds INPUT@ep0). */
    {
        void* SAFE = add_ctl(core, &s, 1, VMSIG_CAP_LIFECYCLE | VMSIG_CAP_INPUT, 1, 1ull << 0);
        /* PAUSE = safe lifecycle: lease_class_for_down -> -1 => not gated. */
        CHECK(send_life(SAFE, VMSIG_LIFE_PAUSE, 90, 0) == 0,
              "12: safe lifecycle (PAUSE) is not gated by the lease");
    }

    vmsig_core_free(core);
    vmsig_ctx_free(ctx);
}
CHECK(s.cnt[1].denied == 1, "13: B got DENIED"); + CHECK(s.cnt[0].revoked == 0, "13: A not revoked"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ===== 4) preemption: high prio takes the lease away (REVOKED to the old, GRANTED to the new). + * signaling does NOT release what is held (rolling back finalization is the control's decision). ===== */ +static void test_preempt(void) { + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + lease_state s; memset(&s, 0, sizeof s); s.core = core; + + void* A = add_ctl(core, &s, 0, VMSIG_CAP_INPUT, 10, 1ull << 0); + void* B = add_ctl(core, &s, 1, VMSIG_CAP_INPUT, 100, 1ull << 0); + vmsig_core_add_adapter(core, vmsig_input_ops(), NULL, 0); /* stub input */ + s.ctl_a = A; s.ctl_b = B; s.scenario = 1; + + acquire(A, VMSIG_LEASE_INPUT, 0); + send_key(A, 30, 1, 11, 0); /* A injects a KEY (corr=11) */ + /* B preempts from on_ev AFTER ack corr=11. There is NO finalization => wait for 3 actuations: + * A-down(11), B-down(22), B-up(23). */ + s.stop_after_acks = 3; + vmsig_core_run(core); + + CHECK(s.cnt[1].granted == 1, "4: B gets GRANTED on preemption"); + CHECK(s.cnt[0].revoked == 1, "4: A gets LEASE_REVOKED"); + int saw22 = 0; + for (int i = 0; i < s.nack; i++) if (s.ack_log[i] == 22) saw22 = 1; + CHECK(saw22, "4: the new owner's (B) input is actuated after preemption"); + /* in-flight fencing of the ex-owner — covered separately in test_inflight_fence. 
*/ + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ===== reacquire by the owner with a key held down does NOT synthesize an up (self-preemption) ===== */ +static void test_self_reacquire(void) { + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + lease_state s; memset(&s, 0, sizeof s); s.core = core; + + void* A = add_ctl(core, &s, 0, VMSIG_CAP_INPUT, 10, 1ull << 0); + vmsig_core_add_adapter(core, vmsig_input_ops(), NULL, 0); + + acquire(A, VMSIG_LEASE_INPUT, 0); + send_key(A, 30, 1, 11, 0); /* hold down */ + acquire(A, VMSIG_LEASE_INPUT, 0); /* reacquire by the same owner -> idempotent */ + send_key(A, 31, 1, 22, 0); /* another key */ + send_key(A, 30, 0, 33, 0); + send_key(A, 31, 0, 44, 0); + + s.stop_after_acks = 4; /* there must be NO synthesized up (corr=0) */ + vmsig_core_run(core); + + CHECK(s.cnt[0].granted == 2, "self: repeated ACQUIRE -> another GRANTED"); + int saw_zero = 0; + for (int i = 0; i < s.nack; i++) if (s.ack_log[i] == 0) saw_zero = 1; + CHECK(!saw_zero, "self: reacquire by the owner does NOT synthesize a key-up"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ===== 9) reclaim-on-death: the slot is freed, RECLAIMED, B GRANTED ===== + * We model death via core_request_drop(id): core_reap -> core_lease_reap_control + * frees the owner's slot (held is NOT finalized — that's a vmctl write / control's decision). */ +static int g_reclaimed = 0; +static int g_lease_denied = 0; +static void audit_cb(void* ud, const vmsig_audit* a) { + (void)ud; + if (a->kind == VMSIG_AUDIT_LEASE_RECLAIMED) g_reclaimed++; + if (a->kind == VMSIG_AUDIT_LEASE_DENIED) g_lease_denied++; +} +/* On RECLAIMED we stop the loop (to end the reap run deterministically): ud=core. */ +static void reclaim_audit_cb(void* ud, const vmsig_audit* a) { + if (a->kind == VMSIG_AUDIT_LEASE_RECLAIMED) { + g_reclaimed++; + if (ud) vmsig_core_stop((vmsig_core*)ud); + } +} + +/* Variant of add_ctl that returns the control's id (via out). 
*/ +static void* add_ctl_id(vmsig_core* core, lease_state* s, int idx, uint32_t cap, + uint32_t arb_prio, uint64_t epmask, int* out_id) { + cref* r = cref_new(s, idx); + vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg); + cfg.on_event = on_ev; cfg.user = r; + cfg.sub.source_mask = 0xFFFFFFFFu; cfg.sub.prio_min = VMSIG_PRIO_BULK; + void* ctl = vmsig_inproc_control_new(&cfg); + vmsig_grant g; memset(&g, 0, sizeof g); + g.endpoint_mask = epmask; g.source_mask = 0xFFFFFFFFu; + g.cap_mask = cap | VMSIG_CAP_OBSERVE; g.arb_prio = arb_prio; + int id = vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); + if (out_id) *out_id = id; + return ctl; +} + +static void test_reclaim(void) { + g_reclaimed = 0; + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + vmsig_core_set_audit(core, reclaim_audit_cb, core); /* RECLAIMED -> stop the loop */ + lease_state s; memset(&s, 0, sizeof s); s.core = core; + + int a_id = -1; + void* A = add_ctl_id(core, &s, 0, VMSIG_CAP_INPUT, 10, 1ull << 0, &a_id); + void* B = add_ctl_id(core, &s, 1, VMSIG_CAP_INPUT, 5, 1ull << 0, NULL); /* LOW prio */ + + acquire(A, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1); + CHECK(s.cnt[0].granted == 1, "9: A owns"); + + /* A's death: reap frees its slot (RECLAIMED); the audit-cb stops the loop. */ + core_request_drop(core, a_id); + core->stopping = 0; /* white-box: reuse the loop (like pump_n) */ + vmsig_core_run(core); + CHECK(g_reclaimed == 1, "9: audit RECLAIMED on owner death"); + + /* the slot is free: B (low prio) takes it without preemption */ + acquire(B, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1); + CHECK(s.cnt[1].granted == 1, "9: B GRANTED after reclaim (slot is free)"); + + (void)A; (void)B; + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ===== 10) in-flight fencing: losing the lease BEFORE pump_down -> drop ===== */ +/* A owns it, queues CMD_INPUT into ctx (via emit_down -> ctx), then B preempts + * SYNCHRONOUSLY (acquire does not go through ctx). 
By the time pump_down reaches A's CMD_INPUT, + * the owner is already B => the fence drops A's command (there must be NO actuation of A). */ +static void test_inflight_fence(void) { + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + lease_state s; memset(&s, 0, sizeof s); s.core = core; + + void* A = add_ctl(core, &s, 0, VMSIG_CAP_INPUT, 10, 1ull << 0); + void* B = add_ctl(core, &s, 1, VMSIG_CAP_INPUT, 100, 1ull << 0); + vmsig_core_add_adapter(core, vmsig_input_ops(), NULL, 0); + + acquire(A, VMSIG_LEASE_INPUT, 0); + /* A queues a command into ctx (corr=55) — it passes the gate (A owns it), lands in the DOWN queue */ + send_key(A, 30, 1, 55, 0); + /* B preempts SYNCHRONOUSLY (before the loop has called pump_down) */ + acquire(B, VMSIG_LEASE_INPUT, 0); + /* B sends its own command (corr=66) */ + send_key(B, 31, 1, 66, 0); + send_key(B, 31, 0, 67, 0); + + /* Expected actuations: finalization on preemption (corr=0, but A held nothing by the + * moment of preemption — A's down is still in ctx, the held-set is empty => finalize=0 ups), + * then B's 66 and 67. A's 55 MUST be dropped by the fence (no corr=55). */ + s.stop_after_acks = 2; /* B's down + up */ + vmsig_core_run(core); + + int saw55 = 0; + for (int i = 0; i < s.nack; i++) if (s.ack_log[i] == 55) saw55 = 1; + CHECK(!saw55, "10: in-flight ex-owner's command dropped by the fence"); + CHECK(s.cnt[1].granted == 1, "10: B owns after preemption"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ===== lease source gate + audit on acquire-deny ===== + * Regression: ACQUIRE is intercepted BEFORE grant_allows_down, so source_mask and + * audit of the deny paths must be checked/emitted IN THE lease layer ITSELF (otherwise: holding + * someone else's slot bypassing source_mask = DoS; audit-invisible probing of caps/endpoints). 
*/ +static void* add_ctl_src(vmsig_core* core, lease_state* s, int idx, uint32_t cap, + uint64_t epmask, uint32_t source_mask) { + cref* r = cref_new(s, idx); + vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg); + cfg.on_event = on_ev; cfg.user = r; cfg.sub.source_mask = 0xFFFFFFFFu; + void* ctl = vmsig_inproc_control_new(&cfg); + vmsig_grant g; memset(&g, 0, sizeof g); + g.endpoint_mask = epmask; g.source_mask = source_mask; + g.cap_mask = cap | VMSIG_CAP_OBSERVE; g.arb_prio = 10; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); + return ctl; +} +static void test_audit_and_source(void) { + g_lease_denied = 0; + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + vmsig_core_set_audit(core, audit_cb, NULL); + lease_state s; memset(&s, 0, sizeof s); s.core = core; + + /* cap=INPUT, but source_mask WITHOUT SRC_INPUT: the INPUT lease cannot be acquired (DoS bypass). */ + void* NS = add_ctl_src(core, &s, 0, VMSIG_CAP_INPUT, 1ull << 0, ~(1u << VMSIG_SRC_INPUT)); + int before = g_lease_denied; + acquire(NS, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1); + CHECK(s.cnt[0].denied == 1, "src: acquire without the SRC_INPUT bit -> DENIED"); + CHECK(s.cnt[0].last_deny_reason == VMSIG_LEASE_DENY_NOGRANT, "src: reason=NOGRANT"); + CHECK(g_lease_denied == before + 1, "audit: source-deny emits LEASE_DENIED"); + + /* without cap: NOCAP + audit (previously acquire-deny was invisible to the audit). */ + void* NC = add_ctl_src(core, &s, 1, 0u, 1ull << 0, 0xFFFFFFFFu); + before = g_lease_denied; + acquire(NC, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1); + CHECK(s.cnt[1].last_deny_reason == VMSIG_LEASE_DENY_NOCAP, "src: without cap -> NOCAP"); + CHECK(g_lease_denied == before + 1, "audit: NOCAP-deny emits LEASE_DENIED"); + + /* control case: cap+source -> GRANTED, without a spurious deny audit. 
*/ + void* OK = add_ctl_src(core, &s, 2, VMSIG_CAP_INPUT, 1ull << 0, 0xFFFFFFFFu); + before = g_lease_denied; + acquire(OK, VMSIG_LEASE_INPUT, 0); pump_n(&s, 1); + CHECK(s.cnt[2].granted == 1, "src: cap+source -> GRANTED"); + CHECK(g_lease_denied == before, "audit: successful acquire does not emit a deny"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +int main(void) { + printf("test_lease\n"); + test_sync_group(); + test_policy_group(); + test_preempt(); + test_self_reacquire(); + test_inflight_fence(); + test_reclaim(); + test_audit_and_source(); + cref_free_all(); + + printf("lease tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_memctx.c b/src/test/test_memctx.c new file mode 100644 index 0000000..9780706 --- /dev/null +++ b/src/test/test_memctx.c @@ -0,0 +1,339 @@ +/* test_memctx.c — seam for the coherent address-space context (MEMCTX). + * + * 1) multicast + RO-fd + decode: a holder with CAP_MEMCTX receives MEMCTX, kcr3, + * epoch=0, nseg=1; the received fd mmaps PROT_READ, while PROT_WRITE -> EACCES + * (RO physically enforced); the vmsig_memctx_segs helper reconstructs segs[]; + * a co-resident holder without CAP_MEMCTX does NOT receive it (deny); + * 2) epoch: a synthetic destructive VM_LIFECYCLE -> MEMCTX_INVALIDATED -> + * re-multicast at epoch+1 with a NEW kcr3; + * 3) retain/replay: a LATE subscriber (attached AFTER publication) receives + * the retained MEMCTX with a valid re-shared RO-fd (synchronously on add_control); + * 4) multi-VM: two endpoints, isolation (a VM holder does not see another's context); + * 5) socket E2E: MEMCTX travels as a vmsig_wire frame + RO-fd in cmsg (SCM_RIGHTS), the + * client mmaps RO via the received fd. + * In-proc (except 5) and under ASAN. SISC: not a single control name in the adapter. 
*/ +#define _GNU_SOURCE +#include "vmsig.h" +#include "vmsig_socket.h" /* vmsig_wire, vmsig_socket_attach */ +#include "core_internal.h" /* core_emit_up (synthetic lifecycle injection) */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { \ + if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \ +} while (0) + +/* ===== in-proc holder ===== */ +typedef struct holder holder; +struct holder { + vmsig_core* core; + holder* peer; /* multi-VM: stop when both are ready (or NULL) */ + int is_driver; /* stops the loop on a condition */ + uint32_t expect_ep; + int memctx, invalidated, ticks, bad_ep; + uint64_t last_kcr3, kcr3_e0; + uint32_t last_epoch, last_nseg; + int ro_ok, rw_eacces, seg0_ok; + int inject_reset, injected; + int stop_epoch; /* stop when last_epoch>=stop_epoch (-1 = else) */ +}; + +static void maybe_stop(holder* h) { + if (!h->is_driver) return; + if (h->ticks > 30) { vmsig_core_stop(h->core); return; } /* failsafe (vmhost ticks) */ + if (h->stop_epoch >= 0) { + if ((int)h->last_epoch >= h->stop_epoch && h->memctx >= 1) vmsig_core_stop(h->core); + } else if (h->peer) { + if (h->memctx >= 1 && h->peer->memctx >= 1) vmsig_core_stop(h->core); + } else if (h->memctx >= 1) { + vmsig_core_stop(h->core); + } +} + +static int h_on_ev(void* u, const vmsig_event* ev) { + holder* h = u; + if (ev->kind == VMSIG_EV_VM_LIFECYCLE) h->ticks++; + else if (ev->kind == VMSIG_EV_MEMCTX_INVALIDATED) h->invalidated++; + maybe_stop(h); + return 0; +} + +static int h_on_memctx(void* u, const vmsig_event* ev, int fd) { + holder* h = u; + const vmsig_memctx* m = (const vmsig_memctx*)ev->inln; + h->memctx++; + if (ev->endpoint != h->expect_ep) h->bad_ep++; + h->last_kcr3 = m->kcr3; h->last_epoch = m->epoch; + if (m->epoch == 0) h->kcr3_e0 = m->kcr3; + + uint32_t n = 0; + const vmsig_memseg* segs = vmsig_memctx_segs(ev, &n); + h->last_nseg = n; + if 
(segs && n >= 1 && segs[0].gpa == 0 && segs[0].len == m->low) h->seg0_ok = 1; + + if (fd >= 0 && m->low) { + void* ro = mmap(NULL, (size_t)m->low, PROT_READ, MAP_SHARED, fd, 0); + if (ro != MAP_FAILED) { h->ro_ok = 1; munmap(ro, (size_t)m->low); } + void* rw = mmap(NULL, (size_t)m->low, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (rw == MAP_FAILED) h->rw_eacces = 1; else munmap(rw, (size_t)m->low); + } + + /* epoch test: on the first context (epoch0) inject a destructive transition. */ + if (h->inject_reset && !h->injected && m->epoch == 0) { + h->injected = 1; + vmsig_event lc; memset(&lc, 0, sizeof lc); + lc.kind = VMSIG_EV_VM_LIFECYCLE; lc.source = VMSIG_SRC_VMHOST; lc.dir = VMSIG_DIR_UP; + lc.prio = VMSIG_PRIO_URGENT; lc.endpoint = h->expect_ep; lc.origin = 0; + vmsig_vm_state vs = { VMSIG_VM_RESET, 0 }; + memcpy(lc.inln, &vs, sizeof vs); + core_emit_up(h->core, &lc); /* core: epoch++ + invalidate + re-multicast */ + } + maybe_stop(h); + return 0; +} + +static void add_holder(vmsig_core* core, holder* h, uint32_t cap, + uint32_t source_mask, uint64_t endpoint_mask) { + vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg); + cfg.on_event = h_on_ev; cfg.on_memctx = h_on_memctx; cfg.user = h; + void* ctl = vmsig_inproc_control_new(&cfg); + vmsig_grant g; memset(&g, 0, sizeof g); + g.principal = 5; g.endpoint_mask = endpoint_mask; + g.source_mask = source_mask; g.cap_mask = cap; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); +} + +/* ---- 1. 
multicast + RO-fd + decode + deny ---------------------------------- */ +static void test_multicast(void) { + printf("test_multicast\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + holder good; memset(&good, 0, sizeof good); + good.core = core; good.is_driver = 1; good.expect_ep = 0; good.stop_epoch = -1; + holder deny; memset(&deny, 0, sizeof deny); + deny.core = core; deny.expect_ep = 0; deny.stop_epoch = -1; + + add_holder(core, &good, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0); + add_holder(core, &deny, VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0); /* no MEMCTX */ + + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost (watchdog)"); + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 0) >= 0, "add memctx"); + + vmsig_core_run(core); + + CHECK(good.memctx >= 1, "GOOD received MEMCTX"); + CHECK(good.last_kcr3 != 0, "kcr3 nonzero"); + CHECK(good.last_epoch == 0, "first publication is epoch 0"); + CHECK(good.last_nseg == 1, "nseg=1 (single-low)"); + CHECK(good.seg0_ok, "segs[] decoded by the helper (gpa=0,len=low)"); + CHECK(good.ro_ok, "RO-fd: mmap(PROT_READ) ok"); + CHECK(good.rw_eacces, "RO-fd: mmap(PROT_WRITE) -> EACCES (RO enforced)"); + CHECK(good.bad_ep == 0, "delivery endpoint is correct"); + CHECK(deny.memctx == 0, "deny without CAP_MEMCTX does NOT receive MEMCTX"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ---- 2. 
epoch: invalidation + re-multicast epoch+1 ------------------------- */ +static void test_epoch(void) { + printf("test_epoch\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + holder h; memset(&h, 0, sizeof h); + h.core = core; h.is_driver = 1; h.expect_ep = 0; h.inject_reset = 1; h.stop_epoch = 1; + + add_holder(core, &h, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0); + + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost (watchdog)"); + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 0) >= 0, "add memctx"); + + vmsig_core_run(core); + + CHECK(h.memctx >= 2, "contexts for epochs 0 and 1 received"); + CHECK(h.invalidated >= 1, "MEMCTX_INVALIDATED delivered on epoch change"); + CHECK(h.last_epoch == 1, "re-multicast at epoch+1"); + CHECK(h.kcr3_e0 != 0 && h.last_kcr3 != 0 && h.last_kcr3 != h.kcr3_e0, + "new kcr3 after re-bootstrap (epoch 1 kcr3 != epoch 0)"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ---- 3. retain/replay to a late subscriber --------------------------------- */ +static void test_retain(void) { + printf("test_retain\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + holder a; memset(&a, 0, sizeof a); + a.core = core; a.is_driver = 1; a.expect_ep = 0; a.stop_epoch = -1; + + add_holder(core, &a, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0); + + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost (watchdog)"); + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 0) >= 0, "add memctx"); + + vmsig_core_run(core); /* A receives MEMCTX, loop stopped */ + CHECK(a.memctx >= 1, "early subscriber A received MEMCTX"); + + /* LATE subscriber B: attaches AFTER publication. Replay of the retained context + * happens SYNCHRONOUSLY in add_control (cell valid) — without a second loop run. 
*/ + holder b; memset(&b, 0, sizeof b); + b.core = core; b.expect_ep = 0; b.stop_epoch = -1; + add_holder(core, &b, VMSIG_CAP_MEMCTX, 0xFFFFFFFFu, 1ull << 0); + + CHECK(b.memctx >= 1, "late subscriber B received the retained MEMCTX (replay)"); + CHECK(b.last_kcr3 != 0, "B: kcr3 nonzero in the replay"); + CHECK(b.ro_ok, "B: re-shared RO-fd mmaps PROT_READ"); + CHECK(b.rw_eacces, "B: re-shared fd is RO (PROT_WRITE -> EACCES)"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ---- 4. multi-VM: endpoint isolation --------------------------------------- */ +static void test_multivm(void) { + printf("test_multivm\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + holder h0; memset(&h0, 0, sizeof h0); + holder h1; memset(&h1, 0, sizeof h1); + h0.core = core; h0.is_driver = 1; h0.expect_ep = 0; h0.stop_epoch = -1; h0.peer = &h1; + h1.core = core; h1.is_driver = 1; h1.expect_ep = 1; h1.stop_epoch = -1; h1.peer = &h0; + + /* each holder is scoped to its OWN endpoint (+OBSERVE for watchdog lifecycle ticks on ep0). */ + add_holder(core, &h0, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 0); + add_holder(core, &h1, VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE, 0xFFFFFFFFu, 1ull << 1); + + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost ep0 (watchdog)"); + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 0) >= 0, "add memctx ep0"); + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 1) >= 0, "add memctx ep1"); + + vmsig_core_run(core); + + CHECK(h0.memctx >= 1 && h0.bad_ep == 0, "VM0 receives ONLY its own context (ep0)"); + CHECK(h1.memctx >= 1 && h1.bad_ep == 0, "VM1 receives ONLY its own context (ep1)"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ---- 5. 
/*
 * Connect a stream socket to a Linux abstract-namespace AF_UNIX address.
 *
 * name - address in "@name" notation: the first character is a placeholder
 *        that is replaced by the leading NUL byte of an abstract address; the
 *        remaining characters are the name proper.
 *
 * Returns the connected descriptor, or -1 when the name is NULL, empty, too
 * long for sun_path, or when socket()/connect() fails.
 */
static int connect_abstract(const char* name) {
    struct sockaddr_un a;

    if (!name) return -1;
    size_t n = strlen(name);
    /* n == 0 would underflow the "n - 1" copy length below; n > sizeof sun_path
     * would write past the end of the address structure. */
    if (n == 0 || n > sizeof a.sun_path) return -1;

    int fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (fd < 0) return -1;

    memset(&a, 0, sizeof a);
    a.sun_family = AF_UNIX;
    a.sun_path[0] = 0; /* leading NUL selects the abstract namespace */
    memcpy(a.sun_path + 1, name + 1, n - 1);

    /* Abstract addresses are length-delimited: family header + NUL + name bytes. */
    socklen_t alen = (socklen_t)(offsetof(struct sockaddr_un, sun_path) + n);
    if (connect(fd, (struct sockaddr*)&a, alen) < 0) { close(fd); return -1; }
    return fd;
}
0 : -1; + for (struct cmsghdr* c = CMSG_FIRSTHDR(&mh); c; c = CMSG_NXTHDR(&mh, c)) + if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_RIGHTS) + memcpy(out_fd, CMSG_DATA(c), sizeof(int)); + got += (size_t)n; + mh.msg_control = NULL; mh.msg_controllen = 0; /* fd only on the first recvmsg */ + } + return 1; +} + +static void test_socket(void) { + printf("test_socket\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, SOCK_EP) >= 0, "add memctx"); + const char* SOCK = "@vmsig-memctx-e2e"; + CHECK(vmsig_socket_attach(core, SOCK, sock_policy, NULL) == 0, "socket attach"); + + pthread_t th; pthread_create(&th, NULL, loop_main, core); + + int c = connect_abstract(SOCK); + CHECK(c >= 0, "client connected"); + if (c < 0) { vmsig_core_stop(core); pthread_join(th, NULL); vmsig_core_free(core); vmsig_ctx_free(ctx); return; } + + struct timeval tv = { .tv_sec = 3, .tv_usec = 0 }; + setsockopt(c, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv); + + int got_ctx = 0, ro_fd = -1, ro_ok = 0, rw_eacces = 0; + vmsig_memctx pod; memset(&pod, 0, sizeof pod); + for (int iter = 0; iter < 20 && !got_ctx; iter++) { + vmsig_wire w; int wfd = -1; + int r = recv_wire(c, &w, &wfd); + if (r != 1) break; + if (w.kind == VMSIG_EV_MEMCTX) { + got_ctx = 1; ro_fd = wfd; + memcpy(&pod, w.inln, sizeof pod); + CHECK(ro_fd >= 0, "MEMCTX frame carries an RO-fd in cmsg"); + CHECK(pod.flags & VMSIG_MEMCTX_RDONLY, "RDONLY flag is set"); + if (ro_fd >= 0 && pod.low) { + void* ro = mmap(NULL, (size_t)pod.low, PROT_READ, MAP_SHARED, ro_fd, 0); + if (ro != MAP_FAILED) { ro_ok = 1; munmap(ro, (size_t)pod.low); } + void* rw = mmap(NULL, (size_t)pod.low, PROT_READ | PROT_WRITE, MAP_SHARED, ro_fd, 0); + if (rw == MAP_FAILED) rw_eacces = 1; else munmap(rw, (size_t)pod.low); + } + } + } + CHECK(got_ctx == 1, "MEMCTX frame arrived over the socket (wire framing)"); + CHECK(ro_ok, "mmap RO via the received fd"); + 
CHECK(rw_eacces, "write-mmap via the received fd fails (RO)"); + + if (ro_fd >= 0) close(ro_fd); + close(c); + vmsig_core_stop(core); + pthread_join(th, NULL); + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +int main(void) { + test_multicast(); + test_epoch(); + test_retain(); + test_multivm(); + test_socket(); + printf("memctx tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_memwrite.c b/src/test/test_memwrite.c new file mode 100644 index 0000000..420a45f --- /dev/null +++ b/src/test/test_memwrite.c @@ -0,0 +1,227 @@ +/* test_memwrite.c — write-signaled seam (MEMWRITE): atomic guest-memory write under an + * exclusive lease. Stub mode (no VM): proves the full path cap -> grant -> lease-gate -> + * route -> adapter -> ACT_ACK without actuation, plus the default-deny and fencing + * invariants. The adapter never sees a control name (SISC). + * + * 1) happy path: CAP_MEMWRITE + a MEMWRITE lease -> CMD_MEMWRITE -> ACT_ACK{ok=1}; + * 2) extent default-deny: len > VMSIG_MEMWRITE_MAX and a missing SRC flag -> ACK{ok=0}; + * 3) lease gate: CMD_MEMWRITE WITHOUT an acquired lease -> dropped at the gate (no ACK); + * 4) cap gate: a control WITHOUT CAP_MEMWRITE cannot acquire the lease (DENIED{NOCAP}); + * 5) in-flight fence: A holds the lease, queues a write, B preempts SYNCHRONOUSLY -> A's + * queued write is dropped by the fence (no ACK for A's corr), B's write actuates. + * In-proc, under ASAN. 
*/ +#include "vmsig.h" +#include "memctx.h" /* VMSIG_MEMWRITE_MAX: the adapter's extent bound (private) */ +#include +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { \ + if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \ +} while (0) + +/* ---- in-proc control: records lease replies + MEMWRITE ACKs ---- */ +typedef struct { + void* core; + int granted, denied, last_deny_reason; + int ack_ok[64]; /* ok flag per ACK in arrival order */ + uint32_t ack_corr[64]; /* corr per ACK */ + int nack; + int stop_replies, replies; /* stop the loop after N lease replies (0=off) */ + int stop_acks; /* stop the loop after N acks (0=off) */ +} cstate; + +typedef struct { cstate* s; } cref; +static cref* g_refs[16]; static int g_nrefs = 0; +static cref* cref_new(cstate* s) { + cref* r = calloc(1, sizeof *r); r->s = s; + if (g_nrefs < 16) g_refs[g_nrefs++] = r; + return r; +} +static void cref_free_all(void) { for (int i = 0; i < g_nrefs; i++) free(g_refs[i]); g_nrefs = 0; } + +static int on_ev(void* user, const vmsig_event* ev) { + cref* r = user; cstate* s = r->s; + switch (ev->kind) { + case VMSIG_EV_LEASE_GRANTED: s->granted++; s->replies++; break; + case VMSIG_EV_LEASE_DENIED: + s->denied++; + s->last_deny_reason = (int)((const vmsig_lease_req*)ev->inln)->reason; + s->replies++; + break; + case VMSIG_EV_ACT_ACK: + if (s->nack < 64) { + /* inln layout from mc_memwrite_ack: {int ok; uint32_t corr; uint32_t origin}. 
*/ + int ok; memcpy(&ok, ev->inln, sizeof ok); + s->ack_ok[s->nack] = ok; + s->ack_corr[s->nack] = ev->corr; + s->nack++; + } + break; + default: break; + } + if (s->stop_replies && s->replies >= s->stop_replies) vmsig_core_stop(s->core); + if (s->stop_acks && s->nack >= s->stop_acks) vmsig_core_stop(s->core); + return 0; +} + +static void* add_ctl(vmsig_core* core, cstate* s, uint32_t cap, uint32_t arb_prio) { + cref* r = cref_new(s); + vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg); + cfg.on_event = on_ev; cfg.user = r; + cfg.sub.source_mask = 0xFFFFFFFFu; cfg.sub.prio_min = VMSIG_PRIO_BULK; + void* ctl = vmsig_inproc_control_new(&cfg); + vmsig_grant g; memset(&g, 0, sizeof g); + g.endpoint_mask = 1ull << 0; g.source_mask = 0xFFFFFFFFu; + g.cap_mask = cap | VMSIG_CAP_OBSERVE; g.arb_prio = arb_prio; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); + return ctl; +} + +/* ---- DOWN send helpers ---- */ +static int acquire_mw(void* ctl) { + vmsig_event d; memset(&d, 0, sizeof d); + d.kind = VMSIG_EV_CMD_ACQUIRE; d.source = VMSIG_SRC_MEMCTX; d.dir = VMSIG_DIR_DOWN; + d.endpoint = 0; d.prio = VMSIG_PRIO_HIGH; + vmsig_lease_req lr = { VMSIG_LEASE_MEMWRITE, 0 }; + memcpy(d.inln, &lr, sizeof lr); + return vmsig_inproc_send(ctl, &d); +} + +/* CMD_MEMWRITE with inline SRC; corr for tracking. flags: VMSIG_MW_SRC_* (0 => no SRC). */ +static int send_write(void* ctl, uint64_t gva, uint32_t len, uint32_t flags, + const void* src, uint32_t corr) { + vmsig_event d; memset(&d, 0, sizeof d); + d.kind = VMSIG_EV_CMD_MEMWRITE; d.source = VMSIG_SRC_MEMCTX; d.dir = VMSIG_DIR_DOWN; + d.endpoint = 0; d.prio = VMSIG_PRIO_HIGH; d.corr = corr; + vmsig_memwrite mw = { gva, len, flags }; + memcpy(d.inln, &mw, sizeof mw); + if ((flags & VMSIG_MW_SRC_INLINE) && src && len <= VMSIG_MEMWRITE_INLINE) + memcpy(d.inln + sizeof mw, src, len); + return vmsig_inproc_send(ctl, &d); +} + +/* Run the loop until N acks (used after queuing actuated writes). 
*/ +static void run_until_acks(cstate* s, int n) { + vmsig_core* c = (vmsig_core*)s->core; + s->stop_acks = n; s->stop_replies = 0; + vmsig_core_run(c); + s->stop_acks = 0; +} + +/* ---- 1+2+3: happy path, extent default-deny, lease gate -------------------- */ +static void test_path_and_deny(void) { + printf("test_path_and_deny\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + cstate s; memset(&s, 0, sizeof s); s.core = core; + + void* A = add_ctl(core, &s, VMSIG_CAP_MEMWRITE, 10); + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 0) >= 0, "add memctx"); + + /* 3) lease gate: without ACQUIRE the write is dropped at the gate (-1, no actuation). */ + uint8_t pat[8] = { 0xDE, 0xAD, 0xBE, 0xEF, 1, 2, 3, 4 }; + CHECK(send_write(A, 0x1000, 8, VMSIG_MW_SRC_INLINE, pat, 99) == -1, + "3: CMD_MEMWRITE without a lease is dropped by the gate"); + + /* acquire the MEMWRITE lease (synchronous intercept; UP reply paced by ctx). */ + CHECK(acquire_mw(A) == 0, "acquire submitted"); + + /* 1) happy path: inline write -> queued -> ACT_ACK{ok=1}. Also drains the GRANTED reply. */ + CHECK(send_write(A, 0x1000, 8, VMSIG_MW_SRC_INLINE, pat, 11) == 0, + "1: owner's CMD_MEMWRITE passes the gate"); + + /* 2) extent: len > MAX -> ACK{ok=0}, NOT actuated (queued ack on the loop thread). */ + CHECK(send_write(A, 0x2000, VMSIG_MEMWRITE_MAX + 1, VMSIG_MW_SRC_INLINE, pat, 22) == 0, + "2: over-extent write is accepted by the gate (denied inside the adapter)"); + /* 2b) missing SRC flag -> ACK{ok=0}. */ + CHECK(send_write(A, 0x3000, 4, 0u, NULL, 33) == 0, + "2b: no-SRC-flag write is accepted by the gate (denied inside the adapter)"); + + /* expect 3 ACKs (corr 11/22/33) + the GRANTED reply. 
*/ + run_until_acks(&s, 3); + + CHECK(s.granted == 1, "lease GRANTED once"); + int saw11_ok = -1, saw22_ok = -1, saw33_ok = -1, saw99 = 0; + for (int i = 0; i < s.nack; i++) { + if (s.ack_corr[i] == 11) saw11_ok = s.ack_ok[i]; + if (s.ack_corr[i] == 22) saw22_ok = s.ack_ok[i]; + if (s.ack_corr[i] == 33) saw33_ok = s.ack_ok[i]; + if (s.ack_corr[i] == 99) saw99 = 1; + } + CHECK(saw11_ok == 1, "1: happy-path write ACKs ok=1 (stub)"); + CHECK(saw22_ok == 0, "2: over-extent write ACKs ok=0 (default-deny)"); + CHECK(saw33_ok == 0, "2b: no-SRC-flag write ACKs ok=0 (default-deny)"); + CHECK(!saw99, "3: the gate-dropped write produced no ACK"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ---- 4: cap gate — no CAP_MEMWRITE cannot acquire the lease ----------------- */ +static void test_cap_gate(void) { + printf("test_cap_gate\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + cstate s; memset(&s, 0, sizeof s); s.core = core; + + void* NC = add_ctl(core, &s, 0u /* no MEMWRITE */, 10); + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 0) >= 0, "add memctx"); + + CHECK(acquire_mw(NC) == 0, "acquire submitted"); + s.stop_replies = 1; vmsig_core_run(core); s.stop_replies = 0; + + CHECK(s.denied == 1, "4: acquire without CAP_MEMWRITE -> DENIED"); + CHECK(s.last_deny_reason == VMSIG_LEASE_DENY_NOCAP, "4: reason=NOCAP"); + CHECK(s.granted == 0, "4: not granted"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +/* ---- 5: in-flight fence — losing the lease before pump_down drops the write -- */ +static void test_inflight_fence(void) { + printf("test_inflight_fence\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + cstate s; memset(&s, 0, sizeof s); s.core = core; + + void* A = add_ctl(core, &s, VMSIG_CAP_MEMWRITE, 10); + void* B = add_ctl(core, &s, VMSIG_CAP_MEMWRITE, 100); /* higher prio: preempts */ + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 0) >= 0, "add 
memctx"); + + uint8_t pat[4] = { 1, 2, 3, 4 }; + CHECK(acquire_mw(A) == 0, "A acquires"); + /* A queues a write (corr=55): passes the gate (A owns), lands in the DOWN queue. */ + CHECK(send_write(A, 0x1000, 4, VMSIG_MW_SRC_INLINE, pat, 55) == 0, "A queues write 55"); + /* B preempts SYNCHRONOUSLY (acquire does not go through ctx). */ + CHECK(acquire_mw(B) == 0, "B preempts"); + /* B's own write (corr=66) — should actuate. */ + CHECK(send_write(B, 0x2000, 4, VMSIG_MW_SRC_INLINE, pat, 66) == 0, "B queues write 66"); + + run_until_acks(&s, 1); /* B's 66 acks; A's 55 must be fenced (no ack) */ + + int saw55 = 0, saw66 = 0; + for (int i = 0; i < s.nack; i++) { + if (s.ack_corr[i] == 55) saw55 = 1; + if (s.ack_corr[i] == 66) saw66 = 1; + } + CHECK(!saw55, "5: ex-owner A's in-flight write is dropped by the fence"); + CHECK(saw66, "5: new owner B's write actuates after preemption"); + CHECK(s.granted == 2, "5: A and B each got GRANTED"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +int main(void) { + printf("test_memwrite\n"); + test_path_and_deny(); + test_cap_gate(); + test_inflight_fence(); + cref_free_all(); + printf("memwrite tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_mvm.c b/src/test/test_mvm.c new file mode 100644 index 0000000..ff11be0 --- /dev/null +++ b/src/test/test_mvm.c @@ -0,0 +1,62 @@ +/* test_mvm.c — mode A (single core, multiple VMs): per-endpoint multiplexing and + * per-VM grant scoping on UP delivery. Two vmhost endpoints in one core (each stub + * ticks VM_LIFECYCLE per endpoint); a poller granted only VM0 must see only ep0 + * lifecycle events, the VM1 poller — only ep1. 
*/ +#include "vmsig.h" +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { \ + if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \ +} while (0) + +typedef struct { vmsig_core* core; int seen[2][2]; } mvm; /* seen[control][endpoint] */ +typedef struct { mvm* d; int which; } ctlref; + +static int on_ev(void* user, const vmsig_event* ev) { + ctlref* r = user; mvm* d = r->d; + if (ev->kind == VMSIG_EV_VM_LIFECYCLE && ev->endpoint < 2) + d->seen[r->which][ev->endpoint]++; + if (d->seen[0][0] >= 2 && d->seen[1][1] >= 2) vmsig_core_stop(d->core); + return 0; +} + +int main(void) { + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + mvm d; memset(&d, 0, sizeof d); d.core = core; + ctlref r0 = { &d, 0 }, r1 = { &d, 1 }; + + vmsig_inproc_cfg c0; memset(&c0, 0, sizeof c0); c0.on_event = on_ev; c0.user = &r0; + vmsig_inproc_cfg c1; memset(&c1, 0, sizeof c1); c1.on_event = on_ev; c1.user = &r1; + void* ctl0 = vmsig_inproc_control_new(&c0); + void* ctl1 = vmsig_inproc_control_new(&c1); + + /* grants segregate the pollers per VM */ + vmsig_grant g0; memset(&g0, 0, sizeof g0); + g0.endpoint_mask = 1ull << 0; g0.source_mask = 0xFFFFFFFFu; g0.cap_mask = VMSIG_CAP_OBSERVE; + vmsig_grant g1; memset(&g1, 0, sizeof g1); + g1.endpoint_mask = 1ull << 1; g1.source_mask = 0xFFFFFFFFu; g1.cap_mask = VMSIG_CAP_OBSERVE; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl0, &g0); + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl1, &g1); + + /* two VMs in one core: a vmhost adapter per endpoint (stub ticks VM_LIFECYCLE) */ + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "VM0 adapter"); + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 1) >= 0, "VM1 adapter"); + + int rc = vmsig_core_run(core); + printf("test_mvm rc=%d c0[ep0=%d ep1=%d] c1[ep0=%d ep1=%d]\n", + rc, d.seen[0][0], d.seen[0][1], d.seen[1][0], d.seen[1][1]); + + CHECK(d.seen[0][0] >= 2, "control0 sees lifecycle 
of its own VM0"); + CHECK(d.seen[0][1] == 0, "control0 does NOT see VM1 (grant scoping)"); + CHECK(d.seen[1][1] >= 2, "control1 sees lifecycle of its own VM1"); + CHECK(d.seen[1][0] == 0, "control1 does NOT see VM0"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); + printf("multi-vm tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_sec.c b/src/test/test_sec.c new file mode 100644 index 0000000..c1a082b --- /dev/null +++ b/src/test/test_sec.c @@ -0,0 +1,121 @@ +/* test_sec.c — security layer: grant enforcement on DOWN commands. + * Checks capability split (OBSERVE != INPUT != POWER != VM), source_mask + * on DOWN, destructive vs safe lifecycle/VM, foreign endpoint, default-deny. + * (Memory is no longer a DOWN command: the address-space context is multicast + * upward and gated by CAP_MEMCTX — see test_authz/test_memctx; here only DOWN + * actuation.) vmsig_inproc_send returns the result of core_emit_down (the grant of + * THIS specific control) — no need to run the loop. */ +#include "vmsig.h" +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { \ + if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \ +} while (0) + +static int g_denied = 0; +static void audit_cb(void* ud, const vmsig_audit* a) { + (void)ud; + if (a->kind == VMSIG_AUDIT_DOWN_DENIED) g_denied++; +} + +/* DOWN command of kind on endpoint ep; source derived from kind */ +static int send(void* ctl, vmsig_kind kind, uint32_t ep) { + vmsig_event d; + memset(&d, 0, sizeof d); + d.kind = kind; d.dir = VMSIG_DIR_DOWN; d.endpoint = ep; d.prio = VMSIG_PRIO_NORMAL; + d.source = (kind == VMSIG_EV_CMD_INPUT || kind == VMSIG_EV_CMD_LIFECYCLE) ? 
VMSIG_SRC_INPUT + : VMSIG_SRC_VMHOST; + return vmsig_inproc_send(ctl, &d); +} +/* Send a DOWN CMD_LIFECYCLE for endpoint ep through control ctl. The operation code op is stored in inln[0], narrowed to uint8_t. Returns whatever vmsig_inproc_send() returns; the checks in main compare it against 0 (allowed) and -1 (denied). */ +static int send_life(void* ctl, int op, uint32_t ep) { + vmsig_event d; + memset(&d, 0, sizeof d); + d.kind = VMSIG_EV_CMD_LIFECYCLE; d.source = VMSIG_SRC_INPUT; d.dir = VMSIG_DIR_DOWN; + d.endpoint = ep; d.prio = VMSIG_PRIO_NORMAL; d.inln[0] = (uint8_t)op; + return vmsig_inproc_send(ctl, &d); +} +/* Request a lease of class cls on endpoint ep (destructive/input now requires a lease). Sends a HIGH-priority DOWN CMD_ACQUIRE whose inln holds a vmsig_lease_req initialised as { cls, 0 }. Returns whatever vmsig_inproc_send() returns. */ +static int acq(void* ctl, uint32_t cls, uint32_t ep) { + vmsig_event d; + memset(&d, 0, sizeof d); + d.kind = VMSIG_EV_CMD_ACQUIRE; d.source = VMSIG_SRC_INPUT; d.dir = VMSIG_DIR_DOWN; + d.endpoint = ep; d.prio = VMSIG_PRIO_HIGH; + vmsig_lease_req lr = { cls, 0 }; + memcpy(d.inln, &lr, sizeof lr); + return vmsig_inproc_send(ctl, &d); +} + +/* Send a DOWN CMD_VM for endpoint ep through control ctl. The operation op is wrapped in a vmsig_vm_cmd that is copied into inln. Returns whatever vmsig_inproc_send() returns. */ +static int send_vm(void* ctl, int op, uint32_t ep) { + vmsig_event d; + memset(&d, 0, sizeof d); + d.kind = VMSIG_EV_CMD_VM; d.source = VMSIG_SRC_VMHOST; d.dir = VMSIG_DIR_DOWN; + d.endpoint = ep; d.prio = VMSIG_PRIO_NORMAL; + vmsig_vm_cmd c = { (uint32_t)op }; + memcpy(d.inln, &c, sizeof c); + return vmsig_inproc_send(ctl, &d); +} + +static void* add_ctl(vmsig_core* core, uint32_t cap, uint32_t source_mask) { /* Create an in-process control and register it with core under a grant that has endpoint_mask bit 0 set plus the given cap and source masks. Returns the control handle. */ + vmsig_inproc_cfg cfg; + memset(&cfg, 0, sizeof cfg); /* on_event=NULL, sub=0: no UP needed */ + void* ctl = vmsig_inproc_control_new(&cfg); + vmsig_grant g; memset(&g, 0, sizeof g); + g.endpoint_mask = 1ull << 0; g.source_mask = source_mask; g.cap_mask = cap; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); + return ctl; +} + +int main(void) { /* Security test driver: registers controls with different grants, checks which DOWN commands each one may send, and expects exactly 6 denials counted in g_denied. Exit status is 1 if any CHECK failed, 0 otherwise. */ + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + vmsig_core_set_audit(core, audit_cb, NULL); + + void* A = add_ctl(core, VMSIG_CAP_OBSERVE, 0xFFFFFFFFu); /* screen observer */ + void* B = add_ctl(core, VMSIG_CAP_INPUT | VMSIG_CAP_LIFECYCLE, 0xFFFFFFFFu);/* input 
+ safe lifecycle */ + void* P = add_ctl(core, VMSIG_CAP_POWER, 0xFFFFFFFFu); /* destructive power */ + void* S = add_ctl(core, VMSIG_CAP_INPUT, 1u << VMSIG_SRC_FRAME); /* INPUT, but source=FRAME */ + void* V = add_ctl(core, VMSIG_CAP_VM, 0xFFFFFFFFu); /* VM control (safe) */ + void* C = vmsig_inproc_control_new(&(vmsig_inproc_cfg){0}); /* default-deny */ + vmsig_core_add_control(core, vmsig_inproc_control_ops(), C, NULL); /* registered with a NULL grant */ + + printf("test_security\n"); + /* A — screen observer: does NOT actuate input/lifecycle (split CAP) */ + CHECK(send(A, VMSIG_EV_CMD_INPUT, 0) == -1, "OBSERVE != input"); /* deny 1 */ + CHECK(send_life(A, VMSIG_LIFE_PAUSE, 0) == -1, "OBSERVE != lifecycle"); /* deny 2 */ + + /* B — input + SAFE lifecycle, but NOT destructive power. Destructive/input + * now passes ONLY while holding a class lease => ACQUIRE first. */ + acq(B, VMSIG_LEASE_INPUT, 0); /* NOTE(review): the result of acq() is not checked here */ + CHECK(send(B, VMSIG_EV_CMD_INPUT, 0) == 0, "INPUT => input allowed"); + CHECK(send_life(B, VMSIG_LIFE_PAUSE, 0) == 0, "LIFECYCLE => pause allowed"); + CHECK(send_life(B, VMSIG_LIFE_POWERDOWN, 0) == -1,"powerdown requires CAP_POWER"); /* deny 3 */ + + /* P — destructive power (with a POWER class lease) */ + acq(P, VMSIG_LEASE_POWER, 0); /* NOTE(review): the result of acq() is not checked here */ + CHECK(send_life(P, VMSIG_LIFE_POWERDOWN, 0) == 0, "POWER => powerdown allowed"); + + /* S — has INPUT, but source_mask lacks SRC_INPUT: DOWN input denied */ + CHECK(send(S, VMSIG_EV_CMD_INPUT, 0) == -1, "source_mask on DOWN: SRC_INPUT denied"); /* deny 4 */ + + /* V — VM control: safe ops yes, destructive ones require CAP_POWER */ + CHECK(send_vm(V, VMSIG_VMOP_CONT, 0) == 0, "CAP_VM => cont allowed"); + CHECK(send_vm(V, VMSIG_VMOP_POWERDOWN, 0) == -1, "VM powerdown requires CAP_POWER"); /* deny 5 */ + CHECK(send_vm(P, VMSIG_VMOP_POWERDOWN, 0) == 0, "CAP_POWER => VM powerdown allowed"); + + /* C — default-deny */ + CHECK(send_vm(C, VMSIG_VMOP_QUERY, 0) == -1, "default-deny is deaf"); /* deny 6 */ + + /* audit recorded all 6 DOWN denials */ + CHECK(g_denied == 6, 
"audit: all DOWN denials recorded"); + + vmsig_core_free(core); /* closes/frees all controls */ + vmsig_ctx_free(ctx); + + printf("security tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_sock.c b/src/test/test_sock.c new file mode 100644 index 0000000..d2bc412 --- /dev/null +++ b/src/test/test_sock.c @@ -0,0 +1,154 @@ +/* test_sock.c — out-of-process control: wire codec + authentication/admission. + * Bring up two listeners (one admitting, one rejecting) on abstract sockets, run + * the core in a separate thread, connect clients and check: policy invoked, + * valid poller admitted, unauthorized rejected (EOF), reap without a crash. */ +#define _GNU_SOURCE +#include "vmsig.h" +#include "vmsig_socket.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { \ + if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \ +} while (0) + +static atomic_int g_auth = 0; +static atomic_int g_deny = 0; +static atomic_int g_admit = 0; +static atomic_int g_reject = 0; + +static void audit_cb(void* ud, const vmsig_audit* a) { /* Audit hook: counts ADMIT records in g_admit and REJECT records in g_reject; every other kind is ignored. */ + (void)ud; + if (a->kind == VMSIG_AUDIT_ADMIT) atomic_fetch_add(&g_admit, 1); + else if (a->kind == VMSIG_AUDIT_REJECT) atomic_fetch_add(&g_reject, 1); +} + +static vmsig_grant pol_ok(uint32_t uid, uint32_t pid, void* ud) { /* Policy passed to the OK listener: counts the call in g_auth and returns a grant with principal = uid, endpoint_mask bit 0, every source bit and CAP_OBSERVE only. */ + (void)pid; (void)ud; + atomic_fetch_add(&g_auth, 1); + vmsig_grant g; memset(&g, 0, sizeof g); + g.principal = uid; g.endpoint_mask = 1u << 0; /* NOTE(review): other grants in this patch write 1ull << 0; same value, different literal type */ + g.source_mask = 0xFFFFFFFFu; g.cap_mask = VMSIG_CAP_OBSERVE; + return g; +} +static vmsig_grant pol_deny(uint32_t uid, uint32_t pid, void* ud) { /* Policy passed to the DENY listener: counts the call in g_deny and returns an all-zero grant. */ + (void)uid; (void)pid; (void)ud; + atomic_fetch_add(&g_deny, 1); + vmsig_grant g; memset(&g, 0, sizeof g); /* empty => reject */ + return g; +} + +static int connect_abstract(const char* name) { /* Connect a stream socket to an abstract unix address. The first character of name (an @ in the callers below) is replaced by a NUL byte in sun_path and the remaining n - 1 characters follow it. Returns the connected fd, or -1 on failure. NOTE(review): n is not checked against sizeof a.sun_path and an empty name would make n - 1 wrap; callers here pass short literals. */ + int fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) return -1; + struct sockaddr_un a; 
memset(&a, 0, sizeof a); a.sun_family = AF_UNIX; + size_t n = strlen(name); + a.sun_path[0] = 0; + memcpy(a.sun_path + 1, name + 1, n - 1); + socklen_t alen = (socklen_t)(offsetof(struct sockaddr_un, sun_path) + n); /* address length = header offset + leading NUL + the n - 1 name bytes */ + if (connect(fd, (struct sockaddr*)&a, alen) < 0) { close(fd); return -1; } + return fd; +} + +static void* loop_main(void* p) { vmsig_core_run((vmsig_core*)p); return NULL; } /* thread entry: runs vmsig_core_run() on the core passed as p */ + +static void wait_atomic(atomic_int* a, int want, int ms) { /* Poll *a with a 1 ms sleep between reads, for at most ms iterations, until it is at least want. Returns nothing, so callers re-check the value afterwards. */ + for (int i = 0; i < ms; i++) { + if (atomic_load(a) >= want) return; + struct timespec t = { .tv_sec = 0, .tv_nsec = 1000000 }; + nanosleep(&t, NULL); + } +} + +static void test_wire(void) { /* Wire codec round trip: encode an event, decode it back, compare kind/source/endpoint/corr and the first 48 inln bytes, then check that a frame whose magic is zeroed is rejected with -1. */ + printf("test_wire\n"); + vmsig_event ev; memset(&ev, 0, sizeof ev); + ev.kind = VMSIG_EV_CMD_VM; ev.source = VMSIG_SRC_VMHOST; ev.dir = VMSIG_DIR_DOWN; + ev.prio = VMSIG_PRIO_HIGH; ev.endpoint = 0; ev.corr = 0xABCD; + for (int i = 0; i < 48; i++) ev.inln[i] = (uint8_t)i; /* NOTE(review): 48 is presumably the inln capacity; confirm against vmsig_event.h */ + + vmsig_wire w; vmsig_wire_encode(&w, &ev); + vmsig_event d; + CHECK(vmsig_wire_decode(&w, &d) == 0, "decode ok"); + CHECK(d.kind == ev.kind && d.source == ev.source && + d.endpoint == ev.endpoint && d.corr == ev.corr, "frame fields match"); + CHECK(memcmp(d.inln, ev.inln, 48) == 0, "inln matches"); + + vmsig_wire bad = w; bad.magic = 0; vmsig_event x; + CHECK(vmsig_wire_decode(&bad, &x) == -1, "bad magic rejected"); +} + +int main(void) { /* Socket test driver: runs test_wire(), attaches one listener with pol_ok and one with pol_deny, runs the core on a second thread and drives clients against both. Exit status is 1 if any CHECK failed, 0 otherwise. */ + test_wire(); + + printf("test_socket\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + vmsig_core_set_audit(core, audit_cb, NULL); + const char* OK = "@vmsig-sock-ok-test"; + const char* DENY = "@vmsig-sock-deny-test"; + CHECK(vmsig_socket_attach(core, OK, pol_ok, NULL) == 0, "attach ok listener"); + CHECK(vmsig_socket_attach(core, DENY, pol_deny, NULL) == 0, "attach deny listener"); + + pthread_t th; + pthread_create(&th, NULL, loop_main, core); /* NOTE(review): the result of pthread_create() is not checked */ + + /* valid poller: connect -> policy -> admission */ + int c1 = connect_abstract(OK); + CHECK(c1 >= 0, "client connected (ok)"); + 
wait_atomic(&g_auth, 1, 1000); + CHECK(atomic_load(&g_auth) >= 1, "policy invoked — poller authenticated/admitted"); + if (c1 >= 0) close(c1); /* disconnect -> deferred reap (no crash) */ + + /* unauthorized: connect -> server closes -> EOF on the client */ + int c2 = connect_abstract(DENY); + CHECK(c2 >= 0, "client connected (deny)"); + wait_atomic(&g_deny, 1, 1000); + CHECK(atomic_load(&g_deny) >= 1, "deny policy invoked"); + if (c2 >= 0) { + struct timeval tv = { .tv_sec = 1, .tv_usec = 0 }; + setsockopt(c2, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv); /* 1 s receive timeout: a read that times out returns -1, so only a real EOF satisfies the r == 0 check below */ + char b; ssize_t r = read(c2, &b, 1); + CHECK(r == 0, "connection rejected by server (EOF)"); + close(c2); + } + + /* slot reuse: churn > MAX_CONTROLS(64). Without returning slots the listener + * would die after 64 cycles. Each cycle: connect(ok) -> wait auth++ -> close. */ + int base = atomic_load(&g_auth); + const int churn = 70; + for (int k = 0; k < churn; k++) { + int fc = connect_abstract(OK); + if (fc < 0) { CHECK(0, "churn connect"); break; } + wait_atomic(&g_auth, base + k + 1, 1000); + close(fc); + struct timespec ts = { .tv_sec = 0, .tv_nsec = 2 * 1000000 }; + nanosleep(&ts, NULL); /* let the loop reap before the next connection */ + } + CHECK(atomic_load(&g_auth) >= base + churn, + "slots reused: churn > MAX_CONTROLS admitted"); + + /* audit recorded admissions and rejections */ + CHECK(atomic_load(&g_admit) >= 1, "audit: poller admission"); + CHECK(atomic_load(&g_reject) >= 1, "audit: rejection (deny listener)"); + + struct timespec t = { .tv_sec = 0, .tv_nsec = 50 * 1000000 }; + nanosleep(&t, NULL); /* let the loop process the reaps */ + vmsig_core_stop(core); + pthread_join(th, NULL); + vmsig_core_free(core); + vmsig_ctx_free(ctx); + + printf("socket tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 
1 : 0; +} diff --git a/src/test/test_vmhost.c b/src/test/test_vmhost.c new file mode 100644 index 0000000..0a3490d --- /dev/null +++ b/src/test/test_vmhost.c @@ -0,0 +1,146 @@ +/* test_vmhost.c — QEMU/QMP host-plane, armed path: fake QMP server (this test) + * <-> real QMP client vmhost. We verify: handshake (greeting -> qmp_capabilities + * -> return -> SEAM_UP), async events -> VM_LIFECYCLE (broadcast), CMD_VM{QUERY} + * -> command to server -> return -> addressed VM_LIFECYCLE to the initiator, EOF -> SEAM_DOWN. */ +#define _GNU_SOURCE +#include "vmsig.h" +#include "vmhost.h" /* private cfg (CMake provides the include path) */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { \ + if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \ +} while (0) + +static atomic_int g_seamup = 0, g_seamdown = 0; +static atomic_int g_paused = 0, g_running_bcast = 0, g_query_reply = 0; +static void* g_ctl = NULL; /* control handle on_ev uses to send its QUERY; main assigns it before starting the loop thread */ + +static int on_ev(void* user, const vmsig_event* ev) { /* Control event callback. SEAM_UP from the vmhost source sets g_seamup and sends a CMD_VM QUERY (corr 0x55) through g_ctl; SEAM_DOWN from the vmhost source sets g_seamdown; VM_LIFECYCLE events set g_query_reply, g_paused or g_running_bcast depending on ev->origin and the decoded state. Always returns 0. */ + (void)user; + if (ev->kind == VMSIG_EV_SEAM_UP && ev->source == VMSIG_SRC_VMHOST) { + atomic_store(&g_seamup, 1); + vmsig_event d; memset(&d, 0, sizeof d); /* once ready — query status */ + d.kind = VMSIG_EV_CMD_VM; d.source = VMSIG_SRC_VMHOST; d.dir = VMSIG_DIR_DOWN; + d.prio = VMSIG_PRIO_NORMAL; d.endpoint = 0; d.corr = 0x55; + vmsig_vm_cmd c = { VMSIG_VMOP_QUERY }; memcpy(d.inln, &c, sizeof c); + vmsig_inproc_send(g_ctl, &d); /* NOTE(review): the send result is ignored; a denied QUERY would only show up as the query-status CHECK failing in main */ + } else if (ev->kind == VMSIG_EV_SEAM_DOWN && ev->source == VMSIG_SRC_VMHOST) { + atomic_store(&g_seamdown, 1); + } else if (ev->kind == VMSIG_EV_VM_LIFECYCLE) { + vmsig_vm_state vs; memcpy(&vs, ev->inln, sizeof vs); + if (ev->origin) { /* addressed reply to our QUERY */ + if (vs.state == VMSIG_VM_RUNNING) atomic_store(&g_query_reply, 1); + } else { /* broadcast async event */ + if (vs.state == VMSIG_VM_PAUSED) atomic_store(&g_paused, 1); + if (vs.state == 
VMSIG_VM_RUNNING) atomic_store(&g_running_bcast, 1); + } + } + return 0; +} + +static void* loop_main(void* p) { vmsig_core_run((vmsig_core*)p); return NULL; } /* thread entry: runs vmsig_core_run() on the core passed as p */ + +static int srv_listen(const char* name) { /* Bind a stream socket to the abstract unix address derived from name (its first character is replaced by a NUL byte) and listen with a backlog of 4. Returns the listening fd, or -1 on failure. NOTE(review): the name length is not checked against sizeof a.sun_path and an empty name would make n - 1 wrap. */ + int fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) return -1; + struct sockaddr_un a; memset(&a, 0, sizeof a); a.sun_family = AF_UNIX; + size_t n = strlen(name); + a.sun_path[0] = 0; memcpy(a.sun_path + 1, name + 1, n - 1); + socklen_t alen = (socklen_t)(offsetof(struct sockaddr_un, sun_path) + n); + if (bind(fd, (struct sockaddr*)&a, alen) < 0) { close(fd); return -1; } + if (listen(fd, 4) < 0) { close(fd); return -1; } + return fd; +} +static void srv_send(int fd, const char* s) { ssize_t r = write(fd, s, strlen(s)); (void)r; } /* single write of the whole string; the result is deliberately discarded */ +static int srv_expect(int fd, const char* needle) { /* Read from fd, accumulating into buf, until needle appears in the accumulated text. Returns 1 on a match, 0 on EOF or when the attempts run out. */ + char buf[1024]; size_t len = 0; + for (int i = 0; i < 200; i++) { /* 200 attempts; every failed read adds a 10 ms sleep (about 2 s in total), on top of any time read() itself blocks */ + ssize_t r = read(fd, buf + len, sizeof buf - 1 - len); + if (r > 0) { len += (size_t)r; buf[len] = 0; if (strstr(buf, needle)) return 1; } + else if (r == 0) return 0; + else { struct timespec t = { 0, 10 * 1000000 }; nanosleep(&t, NULL); } + if (len >= sizeof buf - 1) len = 0; /* buffer full without a match: discard it and start over (a needle split across the reset would be missed) */ + } + return 0; +} +static void wait_atomic(atomic_int* a, int ms) { /* Poll *a with a 1 ms sleep between reads, for at most ms iterations, until it becomes non-zero. */ + for (int i = 0; i < ms; i++) { + if (atomic_load(a)) return; + struct timespec t = { 0, 1000000 }; nanosleep(&t, NULL); + } +} + +int main(void) { /* Armed vmhost test driver: this process plays the QMP server on an abstract socket while the vmhost adapter, running on the core thread, connects to it. Exit status is 1 if the listener cannot be created or any CHECK failed, 0 otherwise. */ + const char* QMP = "@vmsig-qmp-fake-test"; + int srv = srv_listen(QMP); + if (srv < 0) { printf("srv_listen failed\n"); return 1; } + + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + vmsig_inproc_cfg cc; memset(&cc, 0, sizeof cc); cc.on_event = on_ev; + void* ctl = vmsig_inproc_control_new(&cc); + g_ctl = ctl; + vmsig_grant g; memset(&g, 0, sizeof g); + g.endpoint_mask = 1ull << 0; g.source_mask = 0xFFFFFFFFu; + g.cap_mask = VMSIG_CAP_OBSERVE | VMSIG_CAP_VM; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); + + /* armed vmhost: it will connect to our fake 
QMP */ + vmsig_vmhost_cfg vcfg; memset(&vcfg, 0, sizeof vcfg); vcfg.qmp_path = QMP; + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), &vcfg, 0) >= 0, "vmhost armed attach"); + + pthread_t th; pthread_create(&th, NULL, loop_main, core); /* NOTE(review): the result of pthread_create() is not checked */ + + /* === QMP server role === */ + int c = accept(srv, NULL, NULL); /* NOTE(review): srv is a blocking socket, so this waits until the adapter connects; no timeout is set */ + CHECK(c >= 0, "server accepted vmhost connection"); + if (c >= 0) { + struct timeval tv = { 0, 50 * 1000 }; + setsockopt(c, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv); /* 50 ms receive timeout so that read() in srv_expect() returns instead of blocking indefinitely */ + + srv_send(c, "{\"QMP\": {\"version\": {}, \"capabilities\": []}}\r\n"); + CHECK(srv_expect(c, "qmp_capabilities"), "client sent qmp_capabilities"); + srv_send(c, "{\"return\": {}}\r\n"); /* -> READY -> SEAM_UP */ + + srv_send(c, "{\"event\": \"STOP\"}\r\n"); /* -> broadcast PAUSED */ + CHECK(srv_expect(c, "query-status"), "client sent query-status (from CMD_VM)"); + srv_send(c, "{\"return\": {\"status\": \"running\"}, \"id\": 1}\r\n"); /* -> addressed reply */ + srv_send(c, "{\"event\": \"RESUME\"}\r\n"); /* -> broadcast RUNNING */ + + wait_atomic(&g_seamup, 1000); + wait_atomic(&g_paused, 1000); + wait_atomic(&g_query_reply, 1000); + wait_atomic(&g_running_bcast, 1000); + + close(c); /* EOF -> SEAM_DOWN */ + wait_atomic(&g_seamdown, 1000); + } + + CHECK(atomic_load(&g_seamup), "handshake complete (SEAM_UP)"); + CHECK(atomic_load(&g_paused), "async STOP -> VM_LIFECYCLE PAUSED (broadcast)"); + CHECK(atomic_load(&g_query_reply), "CMD_VM QUERY -> addressed VM_LIFECYCLE RUNNING"); + CHECK(atomic_load(&g_running_bcast),"async RESUME -> VM_LIFECYCLE RUNNING (broadcast)"); + CHECK(atomic_load(&g_seamdown), "EOF QMP -> SEAM_DOWN"); + + vmsig_core_stop(core); + pthread_join(th, NULL); + vmsig_core_free(core); + vmsig_ctx_free(ctx); + close(srv); + + printf("vmhost tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +}